Instead of using get_user_pages_fast() and kmap_atomic() when writing
to the trace_marker file, just allocate enough space on the ring buffer
directly, and write into it via copy_from_user().

Writing into the trace_marker file used to allocate a temporary buffer
to perform the copy_from_user(), as we didn't want to write into the
ring buffer if the copy failed. But as a trace_marker write is supposed
to be extremely fast, and allocating memory causes other tracepoints to
trigger, Peter Zijlstra suggested using get_user_pages_fast() and
kmap_atomic() to pin the user space pages in memory and read them
directly. Instead, just allocate the space in the ring buffer and use
copy_from_user() directly. If it faults, return -EFAULT and write
"<faulted>" into the ring buffer.

On architectures without an arch-specific get_user_pages_fast(), the
call ends up in the generic get_user_pages_fast(), which grabs
mm->mmap_sem. Once that happens, writing to the trace_marker file can
suddenly cause priority inversions.

This is a backport of Steven Rostedt's patch [1] applied to 3.10.x, so
the Signed-off-by chain is somewhat uncertain at this stage. The patch
compiles, boots and does not immediately explode on impact. By
definition [2] it must therefore be perfect.

1) https://www.spinics.net/lists/kernel/msg2400769.html
2) http://lkml.iu.edu/hypermail/linux/kernel/9804.1/0149.html

Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Henrik Austad <henrik@xxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: stable@xxxxxxxxxxxxxxx
Suggested-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Used-to-be-signed-off-by: Steven Rostedt <rostedt@xxxxxxxxxxx>
Backported-by: Henrik Austad <haustad@xxxxxxxxx>
Tested-by: Henrik Austad <haustad@xxxxxxxxx>
Signed-off-by: Henrik Austad <haustad@xxxxxxxxx>
---
 kernel/trace/trace.c | 78 +++++++++++++++-------------------------------------
 1 file changed, 22 insertions(+), 56 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 18cdf91..94eb1ee 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4501,15 +4501,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	struct ring_buffer *buffer;
 	struct print_entry *entry;
 	unsigned long irq_flags;
-	struct page *pages[2];
-	void *map_page[2];
-	int nr_pages = 1;
+	const char faulted[] = "<faulted>";
 	ssize_t written;
-	int offset;
 	int size;
 	int len;
-	int ret;
-	int i;
+
+/* Used in tracing_mark_raw_write() as well */
+#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */
 
 	if (tracing_disabled)
 		return -EINVAL;
@@ -4520,60 +4518,34 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	if (cnt > TRACE_BUF_SIZE)
 		cnt = TRACE_BUF_SIZE;
 
-	/*
-	 * Userspace is injecting traces into the kernel trace buffer.
-	 * We want to be as non intrusive as possible.
-	 * To do so, we do not want to allocate any special buffers
-	 * or take any locks, but instead write the userspace data
-	 * straight into the ring buffer.
-	 *
-	 * First we need to pin the userspace buffer into memory,
-	 * which, most likely it is, because it just referenced it.
-	 * But there's no guarantee that it is. By using get_user_pages_fast()
-	 * and kmap_atomic/kunmap_atomic() we can get access to the
-	 * pages directly. We then write the data directly into the
-	 * ring buffer.
-	 */
 	BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
 
-	/* check if we cross pages */
-	if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
-		nr_pages = 2;
-
-	offset = addr & (PAGE_SIZE - 1);
-	addr &= PAGE_MASK;
-
-	ret = get_user_pages_fast(addr, nr_pages, 0, pages);
-	if (ret < nr_pages) {
-		while (--ret >= 0)
-			put_page(pages[ret]);
-		written = -EFAULT;
-		goto out;
-	}
+	local_save_flags(irq_flags);
+	size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
 
-	for (i = 0; i < nr_pages; i++)
-		map_page[i] = kmap_atomic(pages[i]);
+	/* If less than "<faulted>", then make sure we can still add that */
+	if (cnt < FAULTED_SIZE)
+		size += FAULTED_SIZE - cnt;
 
-	local_save_flags(irq_flags);
-	size = sizeof(*entry) + cnt + 2; /* possible \n added */
 	buffer = tr->trace_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
 					  irq_flags, preempt_count());
-	if (!event) {
-		/* Ring buffer disabled, return as if not open for write */
-		written = -EBADF;
-		goto out_unlock;
-	}
+
+	if (unlikely(!event))
+		/* Ring buffer disabled, return as if not open for write */
+		return -EBADF;
 
 	entry = ring_buffer_event_data(event);
 	entry->ip = _THIS_IP_;
 
-	if (nr_pages == 2) {
-		len = PAGE_SIZE - offset;
-		memcpy(&entry->buf, map_page[0] + offset, len);
-		memcpy(&entry->buf[len], map_page[1], cnt - len);
+	len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
+	if (len) {
+		memcpy(&entry->buf, faulted, FAULTED_SIZE);
+		cnt = FAULTED_SIZE;
+		written = -EFAULT;
 	} else
-		memcpy(&entry->buf, map_page[0] + offset, cnt);
+		written = cnt;
+	len = cnt;
 
 	if (entry->buf[cnt - 1] != '\n') {
 		entry->buf[cnt] = '\n';
@@ -4583,16 +4555,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 
 	__buffer_unlock_commit(buffer, event);
 
-	written = cnt;
-	*fpos += written;
+	if (written > 0)
+		*fpos += written;
 
- out_unlock:
-	for (i = 0; i < nr_pages; i++){
-		kunmap_atomic(map_page[i]);
-		put_page(pages[i]);
-	}
- out:
 	return written;
 }
-- 
2.7.4
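
For anyone who wants to poke at the changed path by hand, a minimal
userspace sketch follows (not part of the patch; the debugfs mount
point below is an assumption and may differ on your system):

/* marker_test.c - exercise the trace_marker write path.
 * Build: gcc -o marker_test marker_test.c
 * Assumes tracing is mounted at /sys/kernel/debug/tracing.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char msg[] = "hello from trace_marker\n";
	int fd = open("/sys/kernel/debug/tracing/trace_marker", O_WRONLY);

	if (fd < 0) {
		perror("open trace_marker");
		return 1;
	}

	/* Each write() becomes one TRACE_PRINT event. With this patch
	 * the data is copied straight into space reserved in the ring
	 * buffer; if the copy faults, the event records "<faulted>"
	 * and the write fails with EFAULT. */
	if (write(fd, msg, strlen(msg)) < 0)
		perror("write trace_marker");

	close(fd);
	return 0;
}

The marker then shows up in the trace output as a
"tracing_mark_write:" line.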