Another reason this is bogus, confirmed by Jean-Marc's testing: printk is called too often without a valid 'current', i.e. when network packets arrive and are processed. So we should send output of sys_syslog() to current's ring buffer, all printks to the init_syslog_ns, and then we can use ns_printk(syslog_ns, fmt, ...) for targeted printks. And, as discussed on IRC, we'll make syslog_ns a full namespace in its own right, use the last remaining clone flag (if there is one) or build on top of eclone(). It'd be nicer to have a 'real' clone flag so we can also unshare(CLONE_NEWLOG). -serge Quoting Serge E. Hallyn (serue@xxxxxxxxxx): > Provide each user namespace with its own syslog ringbuffer. > > So you can do > ns_exec -cU /bin/bash > dmesg > and see nothing. Root in a container (with private user namespace) > cannot clear the host's ring buffer. > > Since containers do not have a notion of consoles at present, > only the initial user namespace deals with console output or > with the console-related syslog commands. > > This opens the door to targeting printk at certain syslog > namespaces. It's not safe to be applied - it's a quick-n-dirty > hack and won't even compile for CONFIG_PRINTK=n. Also I've not decided > what to do about duplication of printks to init_user_ns so for > now emit_log_char always duplicates to init_user_ns. We probably > want to be smarter about this and output a prefix indicating the > target. > > But I figured discussions about the API would be more meaningful > with a testable patch. 
> > --- > fs/proc/kmsg.c | 5 +- > include/linux/user_namespace.h | 2 + > kernel/printk.c | 225 ++++++++++++++++++++++++++-------------- > kernel/user.c | 4 + > kernel/user_namespace.c | 13 +++ > 5 files changed, 168 insertions(+), 81 deletions(-) > > diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c > index 7ca7834..2746b70 100644 > --- a/fs/proc/kmsg.c > +++ b/fs/proc/kmsg.c > @@ -12,11 +12,12 @@ > #include <linux/poll.h> > #include <linux/proc_fs.h> > #include <linux/fs.h> > +#include <linux/syslog.h> > > #include <asm/uaccess.h> > #include <asm/io.h> > > -extern wait_queue_head_t log_wait; > +extern struct syslog_ns init_syslog_ns; > > extern int do_syslog(int type, char __user *bug, int count); > > @@ -41,7 +42,7 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, > > static unsigned int kmsg_poll(struct file *file, poll_table *wait) > { > - poll_wait(file, &log_wait, wait); > + poll_wait(file, &init_syslog_ns.wait, wait); > if (do_syslog(9, NULL, 0)) > return POLLIN | POLLRDNORM; > return 0; > diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h > index cc4f453..3926c89 100644 > --- a/include/linux/user_namespace.h > +++ b/include/linux/user_namespace.h > @@ -5,6 +5,7 @@ > #include <linux/nsproxy.h> > #include <linux/sched.h> > #include <linux/err.h> > +#include <linux/syslog.h> > > #define UIDHASH_BITS (CONFIG_BASE_SMALL ? 
3 : 8) > #define UIDHASH_SZ (1 << UIDHASH_BITS) > @@ -14,6 +15,7 @@ struct user_namespace { > struct hlist_head uidhash_table[UIDHASH_SZ]; > struct user_struct *creator; > struct work_struct destroyer; > + struct syslog_ns *syslog; > }; > > extern struct user_namespace init_user_ns; > diff --git a/kernel/printk.c b/kernel/printk.c > index 1751c45..5b93447 100644 > --- a/kernel/printk.c > +++ b/kernel/printk.c > @@ -35,9 +35,18 @@ > #include <linux/kexec.h> > #include <linux/ratelimit.h> > #include <linux/kmsg_dump.h> > +#include <linux/user_namespace.h> > > #include <asm/uaccess.h> > > +struct syslog_ns init_syslog_ns; > +#define g_log_wait (init_syslog_ns.wait) > +#define g_log_start (init_syslog_ns.start) > +#define g_log_end (init_syslog_ns.end) > +#define g_log_buf_len (init_syslog_ns.buf_len) > +#define g_logged_chars (init_syslog_ns.logged_chars) > +#define g_log_buf (init_syslog_ns.buf) > + > /* > * for_each_console() allows you to iterate on each console > */ > @@ -52,6 +61,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) > } > > #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) > +#define CONTAINER_BUF_LEN 4096 > > /* printk's without a loglevel use this.. */ > #define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ > @@ -60,8 +70,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 
> #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ > #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ > > -DECLARE_WAIT_QUEUE_HEAD(log_wait); > - > int console_printk[4] = { > DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ > DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ > @@ -98,22 +106,20 @@ EXPORT_SYMBOL_GPL(console_drivers); > static int console_locked, console_suspended; > > /* > - * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars > + * logbuf_lock protects g_log_buf, g_log_start, g_log_end, con_start and g_logged_chars > * It is also used in interesting ways to provide interlocking in > * release_console_sem(). > */ > static DEFINE_SPINLOCK(logbuf_lock); > > -#define LOG_BUF_MASK (log_buf_len-1) > -#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) > +#define LOG_BUF_MASK(ns) ((ns)->buf_len-1) > +#define LOG_BUF(ns, idx) ((ns)->buf[(idx) & LOG_BUF_MASK(ns)]) > > /* > - * The indices into log_buf are not constrained to log_buf_len - they > + * The indices into g_log_buf are not constrained to g_log_buf_len - they > * must be masked before subscripting > */ > -static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */ > -static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ > -static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ > +static unsigned con_start; /* Index into g_log_buf: next char to be sent to consoles */ > > /* > * Array of consoles built from command line options (console=) > @@ -142,9 +148,6 @@ static int console_may_schedule; > #ifdef CONFIG_PRINTK > > static char __log_buf[__LOG_BUF_LEN]; > -static char *log_buf = __log_buf; > -static int log_buf_len = __LOG_BUF_LEN; > -static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ > > #ifdef CONFIG_KEXEC > /* > @@ -157,10 +160,10 @@ static unsigned logged_chars; /* Number of 
chars produced since last read+clear > */ > void log_buf_kexec_setup(void) > { > - VMCOREINFO_SYMBOL(log_buf); > - VMCOREINFO_SYMBOL(log_end); > - VMCOREINFO_SYMBOL(log_buf_len); > - VMCOREINFO_SYMBOL(logged_chars); > + VMCOREINFO_SYMBOL(g_log_buf); > + VMCOREINFO_SYMBOL(g_log_end); > + VMCOREINFO_SYMBOL(g_log_buf_len); > + VMCOREINFO_SYMBOL(g_logged_chars); > } > #endif > > @@ -171,7 +174,7 @@ static int __init log_buf_len_setup(char *str) > > if (size) > size = roundup_pow_of_two(size); > - if (size > log_buf_len) { > + if (size > g_log_buf_len) { > unsigned start, dest_idx, offset; > char *new_log_buf; > > @@ -182,22 +185,22 @@ static int __init log_buf_len_setup(char *str) > } > > spin_lock_irqsave(&logbuf_lock, flags); > - log_buf_len = size; > - log_buf = new_log_buf; > + g_log_buf_len = size; > + g_log_buf = new_log_buf; > > - offset = start = min(con_start, log_start); > + offset = start = min(con_start, g_log_start); > dest_idx = 0; > - while (start != log_end) { > - log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; > + while (start != g_log_end) { > + g_log_buf[dest_idx] = g_log_buf[start & (__LOG_BUF_LEN - 1)]; > start++; > dest_idx++; > } > - log_start -= offset; > + g_log_start -= offset; > con_start -= offset; > - log_end -= offset; > + g_log_end -= offset; > spin_unlock_irqrestore(&logbuf_lock, flags); > > - printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); > + printk(KERN_NOTICE "log_buf_len: %d\n", g_log_buf_len); > } > out: > return 1; > @@ -279,6 +282,7 @@ int do_syslog(int type, char __user *buf, int len) > int do_clear = 0; > char c; > int error = 0; > + struct syslog_ns *syslog_ns = current_user_ns()->syslog; > > error = security_syslog(type); > if (error) > @@ -300,15 +304,17 @@ int do_syslog(int type, char __user *buf, int len) > error = -EFAULT; > goto out; > } > - error = wait_event_interruptible(log_wait, > - (log_start - log_end)); > + error = wait_event_interruptible(syslog_ns->wait, > + (syslog_ns->start - 
syslog_ns->end)); > if (error) > goto out; > i = 0; > spin_lock_irq(&logbuf_lock); > - while (!error && (log_start != log_end) && i < len) { > - c = LOG_BUF(log_start); > - log_start++; > + while (!error && > + (syslog_ns->start != syslog_ns->end) > + && i < len) { > + c = LOG_BUF(syslog_ns, syslog_ns->start); > + syslog_ns->start++; > spin_unlock_irq(&logbuf_lock); > error = __put_user(c,buf); > buf++; > @@ -335,14 +341,14 @@ int do_syslog(int type, char __user *buf, int len) > goto out; > } > count = len; > - if (count > log_buf_len) > - count = log_buf_len; > + if (count > syslog_ns->buf_len) > + count = syslog_ns->buf_len; > spin_lock_irq(&logbuf_lock); > - if (count > logged_chars) > - count = logged_chars; > + if (count > syslog_ns->logged_chars) > + count = syslog_ns->logged_chars; > if (do_clear) > - logged_chars = 0; > - limit = log_end; > + syslog_ns->logged_chars = 0; > + limit = syslog_ns->end; > /* > * __put_user() could sleep, and while we sleep > * printk() could overwrite the messages > @@ -351,9 +357,9 @@ int do_syslog(int type, char __user *buf, int len) > */ > for (i = 0; i < count && !error; i++) { > j = limit-1-i; > - if (j + log_buf_len < log_end) > + if (j + syslog_ns->buf_len < syslog_ns->end) > break; > - c = LOG_BUF(j); > + c = LOG_BUF(syslog_ns, j); > spin_unlock_irq(&logbuf_lock); > error = __put_user(c,&buf[count-1-i]); > cond_resched(); > @@ -377,20 +383,32 @@ int do_syslog(int type, char __user *buf, int len) > } > break; > case 5: /* Clear ring buffer */ > - logged_chars = 0; > + syslog_ns->logged_chars = 0; > break; > case 6: /* Disable logging to console */ > + if (syslog_ns != &init_syslog_ns) { > + error = -EPERM; > + break; > + } > if (saved_console_loglevel == -1) > saved_console_loglevel = console_loglevel; > console_loglevel = minimum_console_loglevel; > break; > case 7: /* Enable logging to console */ > + if (syslog_ns != &init_syslog_ns) { > + error = -EPERM; > + break; > + } > if (saved_console_loglevel != -1) { > 
console_loglevel = saved_console_loglevel; > saved_console_loglevel = -1; > } > break; > case 8: /* Set level of messages printed to console */ > + if (syslog_ns != &init_syslog_ns) { > + error = -EPERM; > + break; > + } > error = -EINVAL; > if (len < 1 || len > 8) > goto out; > @@ -402,10 +420,10 @@ int do_syslog(int type, char __user *buf, int len) > error = 0; > break; > case 9: /* Number of chars in the log buffer */ > - error = log_end - log_start; > + error = syslog_ns->end - syslog_ns->start; > break; > case 10: /* Size of the log buffer */ > - error = log_buf_len; > + error = syslog_ns->buf_len; > break; > default: > error = -EINVAL; > @@ -421,7 +439,7 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) > } > > /* > - * Call the console drivers on a range of log_buf > + * Call the console drivers on a range of g_log_buf > */ > static void __call_console_drivers(unsigned start, unsigned end) > { > @@ -431,7 +449,8 @@ static void __call_console_drivers(unsigned start, unsigned end) > if ((con->flags & CON_ENABLED) && con->write && > (cpu_online(smp_processor_id()) || > (con->flags & CON_ANYTIME))) > - con->write(con, &LOG_BUF(start), end - start); > + con->write(con, &LOG_BUF(&init_syslog_ns, start), > + end - start); > } > } > > @@ -455,11 +474,14 @@ static void _call_console_drivers(unsigned start, > { > if ((msg_log_level < console_loglevel || ignore_loglevel) && > console_drivers && start != end) { > - if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { > + if ((start & LOG_BUF_MASK(&init_syslog_ns)) > > + (end & LOG_BUF_MASK(&init_syslog_ns))) { > /* wrapped write */ > - __call_console_drivers(start & LOG_BUF_MASK, > - log_buf_len); > - __call_console_drivers(0, end & LOG_BUF_MASK); > + __call_console_drivers(start & > + LOG_BUF_MASK(&init_syslog_ns), > + g_log_buf_len); > + __call_console_drivers(0, > + end & LOG_BUF_MASK(&init_syslog_ns)); > } else { > __call_console_drivers(start, end); > } > @@ -468,13 +490,14 @@ static void 
_call_console_drivers(unsigned start, > > /* > * Call the console drivers, asking them to write out > - * log_buf[start] to log_buf[end - 1]. > + * g_log_buf[start] to g_log_buf[end - 1]. > * The console_sem must be held. > */ > static void call_console_drivers(unsigned start, unsigned end) > { > unsigned cur_index, start_print; > static int msg_level = -1; > + static struct syslog_ns *ns = &init_syslog_ns; > > BUG_ON(((int)(start - end)) > 0); > > @@ -482,16 +505,16 @@ static void call_console_drivers(unsigned start, unsigned end) > start_print = start; > while (cur_index != end) { > if (msg_level < 0 && ((end - cur_index) > 2) && > - LOG_BUF(cur_index + 0) == '<' && > - LOG_BUF(cur_index + 1) >= '0' && > - LOG_BUF(cur_index + 1) <= '7' && > - LOG_BUF(cur_index + 2) == '>') { > - msg_level = LOG_BUF(cur_index + 1) - '0'; > + LOG_BUF(ns, cur_index + 0) == '<' && > + LOG_BUF(ns, cur_index + 1) >= '0' && > + LOG_BUF(ns, cur_index + 1) <= '7' && > + LOG_BUF(ns, cur_index + 2) == '>') { > + msg_level = LOG_BUF(ns, cur_index + 1) - '0'; > cur_index += 3; > start_print = cur_index; > } > while (cur_index != end) { > - char c = LOG_BUF(cur_index); > + char c = LOG_BUF(ns, cur_index); > > cur_index++; > if (c == '\n') { > @@ -514,16 +537,26 @@ static void call_console_drivers(unsigned start, unsigned end) > _call_console_drivers(start_print, end, msg_level); > } > > +static void do_emit_log_char(struct syslog_ns *ns, char c) > +{ > + LOG_BUF(ns, ns->end) = c; > + ns->end++; > + if (ns->end - ns->start > ns->buf_len) > + ns->start = ns->end - ns->buf_len; > + if (ns == &init_syslog_ns) { > + if (g_log_end - con_start > g_log_buf_len) > + con_start = g_log_end - g_log_buf_len; > + } > + if (ns->logged_chars < ns->buf_len) > + ns->logged_chars++; > +} > + > static void emit_log_char(char c) > { > - LOG_BUF(log_end) = c; > - log_end++; > - if (log_end - log_start > log_buf_len) > - log_start = log_end - log_buf_len; > - if (log_end - con_start > log_buf_len) > - con_start = 
log_end - log_buf_len; > - if (logged_chars < log_buf_len) > - logged_chars++; > + struct syslog_ns *ns = current_user_ns()->syslog; > + if (ns != &init_syslog_ns) > + do_emit_log_char(ns,c); > + do_emit_log_char(&init_syslog_ns, c); > } > > /* > @@ -669,6 +702,25 @@ static inline void printk_delay(void) > } > } > > +/* called from create_user_ns() */ > +struct syslog_ns * do_syslog_init(void) > +{ > + struct syslog_ns *ns; > + > + ns = kzalloc(sizeof(*ns), GFP_KERNEL); > + if (!ns) > + return ERR_PTR(-ENOMEM); > + ns->buf = kzalloc(CONTAINER_BUF_LEN, GFP_KERNEL); > + if (!ns->buf) { > + kfree(ns); > + return ERR_PTR(-ENOMEM); > + } > + init_waitqueue_head(&ns->wait); > + ns->buf_len = CONTAINER_BUF_LEN; > + > + return ns; > +} > + > asmlinkage int vprintk(const char *fmt, va_list args) > { > int printed_len = 0; > @@ -676,6 +728,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) > unsigned long flags; > int this_cpu; > char *p; > + struct syslog_ns *syslog_ns; > > boot_delay_msec(); > printk_delay(); > @@ -741,7 +794,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) > } > > /* > - * Copy the output into log_buf. If the caller didn't provide > + * Copy the output into g_log_buf. If the caller didn't provide > * appropriate log level tags, we insert them here > */ > for ( ; *p; p++) { > @@ -790,7 +843,13 @@ asmlinkage int vprintk(const char *fmt, va_list args) > * will release 'logbuf_lock' regardless of whether it > * actually gets the semaphore or not. 
> */ > - if (acquire_console_semaphore_for_printk(this_cpu)) > + syslog_ns = current_user_ns()->syslog; > + if (syslog_ns != &init_syslog_ns) { > + int need_wake = (syslog_ns->start != syslog_ns->end); > + spin_unlock_irqrestore(&logbuf_lock, flags); > + if (!oops_in_progress && need_wake) > + wake_up_interruptible(&syslog_ns->wait); > + } else if (acquire_console_semaphore_for_printk(this_cpu)) > release_console_sem(); > > lockdep_on(); > @@ -811,6 +870,14 @@ static void call_console_drivers(unsigned start, unsigned end) > > #endif > > +/* init_syslog_ns is part of init_user_ns */ > +/* note this does not work for !CONFIG_PRINTK */ > +struct syslog_ns init_syslog_ns = { > + .wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_syslog_ns.wait), > + .buf_len = __LOG_BUF_LEN, > + .buf = __log_buf, > +}; > + > static int __add_preferred_console(char *name, int idx, char *options, > char *brl_options) > { > @@ -1010,7 +1077,7 @@ void printk_tick(void) > { > if (__get_cpu_var(printk_pending)) { > __get_cpu_var(printk_pending) = 0; > - wake_up_interruptible(&log_wait); > + wake_up_interruptible(&g_log_wait); > } > } > > @@ -1021,7 +1088,7 @@ int printk_needs_cpu(int cpu) > > void wake_up_klogd(void) > { > - if (waitqueue_active(&log_wait)) > + if (waitqueue_active(&g_log_wait)) > __raw_get_cpu_var(printk_pending) = 1; > } > > @@ -1054,12 +1121,12 @@ void release_console_sem(void) > > for ( ; ; ) { > spin_lock_irqsave(&logbuf_lock, flags); > - wake_klogd |= log_start - log_end; > - if (con_start == log_end) > + wake_klogd |= g_log_start - g_log_end; > + if (con_start == g_log_end) > break; /* Nothing to print */ > _con_start = con_start; > - _log_end = log_end; > - con_start = log_end; /* Flush */ > + _log_end = g_log_end; > + con_start = g_log_end; /* Flush */ > spin_unlock(&logbuf_lock); > stop_critical_timings(); /* don't trace print latency */ > call_console_drivers(_con_start, _log_end); > @@ -1287,7 +1354,7 @@ void register_console(struct console *newcon) > * for us. 
> */ > spin_lock_irqsave(&logbuf_lock, flags); > - con_start = log_start; > + con_start = g_log_start; > spin_unlock_irqrestore(&logbuf_lock, flags); > } > release_console_sem(); > @@ -1498,22 +1565,22 @@ void kmsg_dump(enum kmsg_dump_reason reason) > there's not a lot we can do about that. The new messages > will overwrite the start of what we dump. */ > spin_lock_irqsave(&logbuf_lock, flags); > - end = log_end & LOG_BUF_MASK; > - chars = logged_chars; > + end = g_log_end & LOG_BUF_MASK(&init_syslog_ns); > + chars = g_logged_chars; > spin_unlock_irqrestore(&logbuf_lock, flags); > > - if (logged_chars > end) { > - s1 = log_buf + log_buf_len - logged_chars + end; > - l1 = logged_chars - end; > + if (g_logged_chars > end) { > + s1 = g_log_buf + g_log_buf_len - g_logged_chars + end; > + l1 = g_logged_chars - end; > > - s2 = log_buf; > + s2 = g_log_buf; > l2 = end; > } else { > s1 = ""; > l1 = 0; > > - s2 = log_buf + end - logged_chars; > - l2 = logged_chars; > + s2 = g_log_buf + end - g_logged_chars; > + l2 = g_logged_chars; > } > > if (!spin_trylock_irqsave(&dump_list_lock, flags)) { > diff --git a/kernel/user.c b/kernel/user.c > index 46d0165..102c2ce 100644 > --- a/kernel/user.c > +++ b/kernel/user.c > @@ -18,11 +18,15 @@ > #include <linux/user_namespace.h> > #include "cred-internals.h" > > +/* defined in kernel/printk.c */ > +extern struct syslog_ns init_syslog_ns; > + > struct user_namespace init_user_ns = { > .kref = { > .refcount = ATOMIC_INIT(2), > }, > .creator = &root_user, > + .syslog = &init_syslog_ns, > }; > EXPORT_SYMBOL_GPL(init_user_ns); > > diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c > index 076c7c8..43d46d1 100644 > --- a/kernel/user_namespace.c > +++ b/kernel/user_namespace.c > @@ -11,6 +11,9 @@ > #include <linux/user_namespace.h> > #include <linux/cred.h> > > +/* defined in kernel/printk.c */ > +extern struct syslog_ns *do_syslog_init(void); > + > /* > * Create a new user namespace, deriving the creator from the user in the > * 
passed credentials, and replacing that user with the new root user for the > @@ -34,9 +37,17 @@ int create_user_ns(struct cred *new) > for (n = 0; n < UIDHASH_SZ; ++n) > INIT_HLIST_HEAD(ns->uidhash_table + n); > > + ns->syslog = do_syslog_init(); > + if (!ns->syslog) { > + kfree(ns); > + return -ENOMEM; > + } > + > /* Alloc new root user. */ > root_user = alloc_uid(ns, 0); > if (!root_user) { > + kfree(ns->syslog->buf); > + kfree(ns->syslog); > kfree(ns); > return -ENOMEM; > } > @@ -70,6 +81,8 @@ static void free_user_ns_work(struct work_struct *work) > struct user_namespace *ns = > container_of(work, struct user_namespace, destroyer); > free_uid(ns->creator); > + kfree(ns->syslog->buf); > + kfree(ns->syslog); > kfree(ns); > } > > -- > 1.6.1 > > _______________________________________________ > Containers mailing list > Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx > https://lists.linux-foundation.org/mailman/listinfo/containers _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers