Re: [PATCH] x86/traps: Don't for in_interrupt() to return true in IST handlers

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, May 24, 2016 at 10:09 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> On Tue, May 24, 2016 at 08:43:57AM -0700, Andy Lutomirski wrote:
>> > So this has implications for code like
>> > kernel/events/internal.h:get_recursion_context() and
>> > kernel/trace/trace.c:get_trace_buf().
>> >
>> > Which use a sequence of: in_nmi(), in_irq(), in_softirq() to pick 1 out
>> > of 4 possible contexts.
>> >
>> > I would really like the Changelog to reflect on this. The current code
>> > will have ISTs land in in_irq(), with this chance, not so much.
>>
>> I can change the changelog.
>
> This is basically all I'm asking.
>
>> > Now ISTs as a whole invalidate the whole 'we have only 4 contexts' and
>> > the mapping back to those 4 is going to be somewhat arbitrary I suspect,
>> > but changes like this should be very much aware of these things. And
>> > make an explicit choice.
>>
>> I'm not so comfortable with trying to make any particular guarantees
>> about what all the in_xyz() things will return for different entry
>> types and how they nest.  For example, debug can nest inside itself
>> quite easily (at one point I even had a user program to force it to
>> happen) -- this can trigger when a watchpoint nests inside a
>> single-step trap, and it can also happen when a watchpoint handler is
>> interrupted by an NMI than then recursively triggers the watchpoint.
>> The latter could easily result in nested NMIs that are directly
>> visible to the trace or event code.
>>
>> On x86, there's also MCE, which is NMI-ish, and NMI and MCE can freely
>> nest inside each other.  (Blech.)
>
> Right; all the nesting possibilities are endless and insane. And I don't
> think it makes sense to try and enumerate them all and worse; expose
> this to generic code.
>
> I think having the 4: task, softirq, hardirq, nmi is fine. We just need
> to be a little careful with how we map to them, that we don't wreck some
> common situation and suddenly start loosing trace data.
>
>> Would it make more sense to adjust the trace code to have a percpu
>> nesting count and to match up get_trace_buf with put_trace_buf to
>> decrement the count?  The event code looks like the same thing could
>> happen.
>
> We have, but its coupled with static storage, such as to avoid having to
> do it on stack or *horror* dynamically allocate.
>
> So in order to protect these resource we have the per context nesting
> count.


I gave it a shot and it looks straightforward and is a net deletion of
code.  It's attached and compile-tested only.  Is there a reason you
don't like this approach?

--Andy
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a4b49c99f22..acd366e4ab1a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1986,83 +1986,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
 
 /* created for use with alloc_percpu */
 struct trace_buffer_struct {
-	char buffer[TRACE_BUF_SIZE];
+	int nesting;
+	char buffer[4][TRACE_BUF_SIZE];
 };
 
 static struct trace_buffer_struct *trace_percpu_buffer;
-static struct trace_buffer_struct *trace_percpu_sirq_buffer;
-static struct trace_buffer_struct *trace_percpu_irq_buffer;
-static struct trace_buffer_struct *trace_percpu_nmi_buffer;
 
 /*
- * The buffer used is dependent on the context. There is a per cpu
- * buffer for normal context, softirq contex, hard irq context and
- * for NMI context. Thise allows for lockless recording.
- *
- * Note, if the buffers failed to be allocated, then this returns NULL
+ * Thise allows for lockless recording.  If we're nested too deeply, then
+ * this returns NULL.
  */
 static char *get_trace_buf(void)
 {
-	struct trace_buffer_struct *percpu_buffer;
-
-	/*
-	 * If we have allocated per cpu buffers, then we do not
-	 * need to do any locking.
-	 */
-	if (in_nmi())
-		percpu_buffer = trace_percpu_nmi_buffer;
-	else if (in_irq())
-		percpu_buffer = trace_percpu_irq_buffer;
-	else if (in_softirq())
-		percpu_buffer = trace_percpu_sirq_buffer;
-	else
-		percpu_buffer = trace_percpu_buffer;
+	struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
 
-	if (!percpu_buffer)
+	if (!buffer || buffer->nesting >= 4)
 		return NULL;
 
-	return this_cpu_ptr(&percpu_buffer->buffer[0]);
+	return &buffer->buffer[buffer->nesting++][0];
+}
+
+static void put_trace_buf(void)
+{
+	this_cpu_dec(trace_percpu_buffer->nesting);
 }
 
 static int alloc_percpu_trace_buffer(void)
 {
 	struct trace_buffer_struct *buffers;
-	struct trace_buffer_struct *sirq_buffers;
-	struct trace_buffer_struct *irq_buffers;
-	struct trace_buffer_struct *nmi_buffers;
 
 	buffers = alloc_percpu(struct trace_buffer_struct);
-	if (!buffers)
-		goto err_warn;
-
-	sirq_buffers = alloc_percpu(struct trace_buffer_struct);
-	if (!sirq_buffers)
-		goto err_sirq;
-
-	irq_buffers = alloc_percpu(struct trace_buffer_struct);
-	if (!irq_buffers)
-		goto err_irq;
-
-	nmi_buffers = alloc_percpu(struct trace_buffer_struct);
-	if (!nmi_buffers)
-		goto err_nmi;
+	if (WARN(!buffers, "Could not allocate percpu trace_printk buffer"))
+		return -ENOMEM;
 
 	trace_percpu_buffer = buffers;
-	trace_percpu_sirq_buffer = sirq_buffers;
-	trace_percpu_irq_buffer = irq_buffers;
-	trace_percpu_nmi_buffer = nmi_buffers;
-
 	return 0;
-
- err_nmi:
-	free_percpu(irq_buffers);
- err_irq:
-	free_percpu(sirq_buffers);
- err_sirq:
-	free_percpu(buffers);
- err_warn:
-	WARN(1, "Could not allocate percpu trace_printk buffer");
-	return -ENOMEM;
 }
 
 static int buffers_allocated;
@@ -2153,7 +2111,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	tbuffer = get_trace_buf();
 	if (!tbuffer) {
 		len = 0;
-		goto out;
+		goto out_nobuffer;
 	}
 
 	len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
@@ -2179,6 +2137,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	}
 
 out:
+	put_trace_buf();
+
+out_nobuffer:
 	preempt_enable_notrace();
 	unpause_graph_tracing();
 
@@ -2210,7 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
 	tbuffer = get_trace_buf();
 	if (!tbuffer) {
 		len = 0;
-		goto out;
+		goto out_nobuffer;
 	}
 
 	len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
@@ -2229,7 +2190,11 @@ __trace_array_vprintk(struct ring_buffer *buffer,
 		__buffer_unlock_commit(buffer, event);
 		ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
 	}
- out:
+
+out_nobuffer:
+	put_trace_buf();
+
+out:
 	preempt_enable_notrace();
 	unpause_graph_tracing();
 

[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]