On mån, 2015-05-18 at 16:39 +0200, Alexander Larsson wrote: Didn't get any replies to the below kernel panic (testcase attached), which seems rather important to fix. Reposting to a wider audience. > If I build and run the attached break-kernel.c as a user i get this > kernel panic on the fedora 4.0.3 kernel: > > maj 18 16:33:36 nano kernel: BUG: unable to handle kernel NULL pointer dereference at (null) > maj 18 16:33:36 nano kernel: IP: [<ffffffff81250288>] pin_remove+0x58/0xc0 > maj 18 16:33:36 nano kernel: PGD 1cc973067 PUD 1d727b067 PMD 0 > maj 18 16:33:36 nano kernel: Oops: 0002 [#1] SMP > maj 18 16:33:36 nano kernel: Modules linked in: rfcomm fuse ccm xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 nf_conntrack_netbios_ns nf_conntrack_broadcast ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw bnep arc4 intel_rapl iosf_mbi x86_pkg_temp_thermal coretemp kvm iwlmvm snd_hda_codec_realtek mac80211 snd_hda_codec_hdmi snd_hda_codec_generic vfat fat iTCO_wdt iTCO_vendor_support snd_hda_intel snd_hda_controller snd_hda_codec crct10dif_pclmul snd_hwdep crc32_pclmul snd_seq iwlwifi crc32c_intel > maj 18 16:33:36 nano kernel: snd_seq_device uvcvideo ghash_clmulni_intel videobuf2_vmalloc snd_pcm videobuf2_core cfg80211 videobuf2_memops v4l2_common videodev thinkpad_acpi snd_timer serio_raw btusb media hid_multitouch bluetooth snd lpc_ich mfd_core i2c_i801 mei_me cdc_acm tpm_tis shpchp mei tpm soundcore wmi rfkill i2c_designware_platform i2c_designware_core nfsd auth_rpcgss nfs_acl lockd grace sunrpc cdc_mbim cdc_wdm cdc_ncm usbnet mii i915 i2c_algo_bit drm_kms_helper e1000e drm ptp pps_core video > maj 18 16:33:36 nano kernel: CPU: 2 PID: 2662 Comm: break-kernel Not tainted 4.0.3-201.fc21.x86_64 #1 > maj 18 16:33:36 nano kernel: Hardware name: LENOVO 20A7005RUK/20A7005RUK, BIOS GRET42WW (1.19 ) 11/20/2014 > maj 18 16:33:36 nano kernel: task: ffff8800a1a893e0 ti: ffff8801cafb4000 task.ti: ffff8801cafb4000 > maj 18 16:33:36 nano kernel: RIP: 0010:[<ffffffff81250288>] [<ffffffff81250288>] pin_remove+0x58/0xc0 > maj 18 16:33:36 nano kernel: RSP: 0018:ffff8801cafb7e08 EFLAGS: 00010246 > maj 18 16:33:36 nano kernel: RAX: 0000000000000000 RBX: ffff880212b09f20 RCX: 000000000000011a > maj 18 16:33:36 nano kernel: RDX: 0000000000000000 RSI: 0000000000000005 RDI: ffffffff82004a70 > maj 18 16:33:36 nano kernel: RBP: ffff8801cafb7e18 R08: ffffffff81d25540 R09: ffff8800a6f73a28 > maj 18 16:33:36 nano kernel: R10: 0000000000000000 R11: 0000000000000206 R12: ffff8801cafb7e70 > maj 18 16:33:36 nano kernel: R13: ffff8800a1a893e0 R14: ffff8800a1a893e0 R15: 0000000000000000 > maj 18 16:33:36 nano kernel: FS: 00007fab3d3fa700(0000) GS:ffff88021e280000(0000) knlGS:0000000000000000 > maj 18 16:33:36 nano kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > maj 18 16:33:36 nano kernel: CR2: 0000000000000000 CR3: 00000001d70b6000 CR4: 00000000001407e0 > maj 18 16:33:36 nano kernel: Stack: > maj 18 16:33:36 nano kernel: ffff8800a1a893e0 ffff880212b09f20 ffff8801cafb7e38 ffffffff8123d7c2 > maj 18 16:33:36 nano kernel: ffff8801cafb7e20 ffff880212b09f20 ffff8801cafb7ea8 ffffffff81250414 > maj 18 16:33:36 nano kernel: ffff880212b08da0 ffff88003f41b000 ffff880100000000 ffff8800a1a893e0 > maj 18 16:33:36 nano kernel: Call Trace: > maj 18 16:33:36 nano kernel: [<ffffffff8123d7c2>] drop_mountpoint+0x22/0x40 > maj 18 16:33:36 nano kernel: [<ffffffff81250414>] pin_kill+0x74/0x100 > maj 18 16:33:36 nano kernel: [<ffffffff810dfbb0>] ? wait_woken+0x90/0x90 > maj 18 16:33:36 nano kernel: [<ffffffff812504c9>] mnt_pin_kill+0x29/0x40 > maj 18 16:33:36 nano kernel: [<ffffffff8123cbe0>] cleanup_mnt+0x90/0xa0 > maj 18 16:33:36 nano kernel: [<ffffffff8123cc42>] __cleanup_mnt+0x12/0x20 > maj 18 16:33:36 nano kernel: [<ffffffff810ba607>] task_work_run+0xb7/0xf0 > maj 18 16:33:36 nano kernel: [<ffffffff81014cdd>] do_notify_resume+0x8d/0xa0 > maj 18 16:33:36 nano kernel: [<ffffffff817835e3>] int_signal+0x12/0x17 > maj 18 16:33:36 nano kernel: Code: 48 89 50 08 48 b8 00 01 10 00 00 00 ad de 48 8b 53 28 48 89 43 30 48 b8 00 02 20 00 00 00 ad de 48 89 43 38 48 8b 43 20 48 85 c0 <48> 89 02 74 04 48 89 50 08 48 b8 00 01 10 00 00 00 ad de 48 89 > maj 18 16:33:36 nano kernel: RIP [<ffffffff81250288>] pin_remove+0x58/0xc0 > maj 18 16:33:36 nano kernel: RSP <ffff8801cafb7e08> > maj 18 16:33:36 nano kernel: CR2: 0000000000000000 > maj 18 16:33:36 nano kernel: ---[ end trace e025319273fa36f8 ]--- > > I get no such crash with the previous (3.19.7) kernel. > > _______________________________________________ > Containers mailing list > Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx > https://lists.linuxfoundation.org/mailman/listinfo/containers -- =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= Alexander Larsson Red Hat, Inc alexl@xxxxxxxxxx alexander.larsson@xxxxxxxxx He's a hate-fuelled arachnophobic jungle king possessed of the uncanny powers of an insect. She's a mentally unstable snooty vampire from the wrong side of the tracks. They fight crime!
#define _GNU_SOURCE /* Required for CLONE_NEWNS */ #include <assert.h> #include <arpa/inet.h> #include <dirent.h> #include <errno.h> #include <fcntl.h> #include <getopt.h> #include <linux/loop.h> #include <linux/netlink.h> #include <linux/rtnetlink.h> #include <net/if.h> #include <netinet/in.h> #include <sched.h> #include <signal.h> #include <poll.h> #include <stdarg.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/mount.h> #include <sys/socket.h> #include <sys/stat.h> #include <sys/syscall.h> #include <sys/types.h> #include <sys/wait.h> #include <sys/eventfd.h> #include <sys/signalfd.h> #include <sys/capability.h> #include <sys/prctl.h> #include <unistd.h> #if 0 #define __debug__(x) printf x #else #define __debug__(x) #endif #define N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0])) #define TRUE 1 #define FALSE 0 typedef int bool; #define READ_END 0 #define WRITE_END 1 static void die_with_error (const char *format, ...) { va_list args; int errsv; errsv = errno; va_start (args, format); vfprintf (stderr, format, args); va_end (args); fprintf (stderr, ": %s\n", strerror (errsv)); exit (1); } static void die (const char *format, ...) { va_list args; va_start (args, format); vfprintf (stderr, format, args); va_end (args); fprintf (stderr, "\n"); exit (1); } static void * xmalloc (size_t size) { void *res = malloc (size); if (res == NULL) die ("oom"); return res; } char * strconcat (const char *s1, const char *s2) { size_t len = 0; char *res; if (s1) len += strlen (s1); if (s2) len += strlen (s2); res = xmalloc (len + 1); *res = 0; if (s1) strcat (res, s1); if (s2) strcat (res, s2); return res; } char * strconcat3 (const char *s1, const char *s2, const char *s3) { size_t len = 0; char *res; if (s1) len += strlen (s1); if (s2) len += strlen (s2); if (s3) len += strlen (s3); res = xmalloc (len + 1); *res = 0; if (s1) strcat (res, s1); if (s2) strcat (res, s2); if (s3) strcat (res, s3); return res; } char * strconcat_len (const char *s1, const char *s2, size_t s2_len) { size_t len = 0; char *res; if (s1) len += strlen (s1); if (s2) len += s2_len; res = xmalloc (len + 1); *res = 0; if (s1) strcat (res, s1); if (s2) strncat (res, s2, s2_len); return res; } char* strdup_printf (const char *format, ...) { char *buffer = NULL; va_list args; va_start (args, format); vasprintf (&buffer, format, args); va_end (args); if (buffer == NULL) die ("oom"); return buffer; } static inline int raw_clone(unsigned long flags, void *child_stack) { #if defined(__s390__) || defined(__CRIS__) /* On s390 and cris the order of the first and second arguments * of the raw clone() system call is reversed. */ return (int) syscall(__NR_clone, child_stack, flags); #else return (int) syscall(__NR_clone, flags, child_stack); #endif } static int pivot_root (const char * new_root, const char * put_old) { #ifdef __NR_pivot_root return syscall(__NR_pivot_root, new_root, put_old); #else errno = ENOSYS; return -1; #endif } typedef enum { FILE_TYPE_REGULAR, FILE_TYPE_DIR, FILE_TYPE_SYMLINK, FILE_TYPE_SYSTEM_SYMLINK, FILE_TYPE_BIND, FILE_TYPE_BIND_RO, FILE_TYPE_MOUNT, FILE_TYPE_REMOUNT, FILE_TYPE_DEVICE, } file_type_t; typedef enum { FILE_FLAGS_NONE = 0, FILE_FLAGS_NON_FATAL = 1 << 0, FILE_FLAGS_IF_LAST_FAILED = 1 << 1, FILE_FLAGS_DEVICES = 1 << 2, FILE_FLAGS_NOREMOUNT = 1 << 3, } file_flags_t; typedef struct { file_type_t type; const char *name; mode_t mode; const char *data; file_flags_t flags; int *option; } create_table_t; typedef struct { const char *what; const char *where; const char *type; const char *options; unsigned long flags; } mount_table_t; int ascii_isdigit (char c) { return c >= '0' && c <= '9'; } static const create_table_t create[] = { { FILE_TYPE_DIR, ".oldroot", 0755 }, { FILE_TYPE_DIR, "usr", 0755 }, { FILE_TYPE_BIND_RO, "usr", 0755, "/usr"}, { FILE_TYPE_DIR, "tmp", 01777 }, { FILE_TYPE_DIR, "run", 0755}, { FILE_TYPE_DIR, "var", 0755}, { FILE_TYPE_SYMLINK, "var/tmp", 0755, "/tmp"}, { FILE_TYPE_SYMLINK, "var/run", 0755, "/run"}, { FILE_TYPE_SYSTEM_SYMLINK, "lib32", 0755, "usr/lib32"}, { FILE_TYPE_SYSTEM_SYMLINK, "lib64", 0755, "usr/lib64"}, { FILE_TYPE_SYSTEM_SYMLINK, "lib", 0755, "usr/lib"}, { FILE_TYPE_SYSTEM_SYMLINK, "bin", 0755, "usr/bin" }, { FILE_TYPE_SYSTEM_SYMLINK, "sbin", 0755, "usr/sbin"}, { FILE_TYPE_DIR, "etc", 0755, NULL, 0}, { FILE_TYPE_BIND, "etc", 0755, "/etc", FILE_FLAGS_NOREMOUNT}, { FILE_TYPE_DIR, "proc", 0755}, { FILE_TYPE_MOUNT, "proc"}, { FILE_TYPE_DIR, "dev", 0755}, { FILE_TYPE_MOUNT, "dev"}, { FILE_TYPE_DIR, "dev/shm", 0755}, { FILE_TYPE_MOUNT, "dev/shm"}, { FILE_TYPE_DEVICE, "dev/null", 0666}, { FILE_TYPE_DEVICE, "dev/zero", 0666}, { FILE_TYPE_DEVICE, "dev/full", 0666}, { FILE_TYPE_DEVICE, "dev/random", 0666}, { FILE_TYPE_DEVICE, "dev/urandom", 0666}, { FILE_TYPE_DEVICE, "dev/tty", 0666}, }; static mount_table_t mount_table[] = { { "proc", "proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV }, { "tmpfs", "dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME }, { "tmpfs", "dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME }, }; typedef enum { BIND_READONLY = (1<<0), BIND_PRIVATE = (1<<1), BIND_DEVICES = (1<<2), BIND_RECURSIVE = (1<<3), BIND_NOREMOUNT = (1<<4), } bind_option_t; static int bind_mount (const char *src, const char *dest, bind_option_t options) { bool readonly = (options & BIND_READONLY) != 0; bool private = (options & BIND_PRIVATE) != 0; bool devices = (options & BIND_DEVICES) != 0; bool noremount = (options & BIND_NOREMOUNT) != 0; bool recursive = (options & BIND_RECURSIVE) != 0; if (mount (src, dest, NULL, MS_MGC_VAL|MS_BIND|(recursive?MS_REC:0), NULL) != 0) return 1; if (private) { if (mount ("none", dest, NULL, MS_REC|MS_PRIVATE, NULL) != 0) return 2; } if (!noremount) { if (mount ("none", dest, NULL, MS_MGC_VAL|MS_BIND|MS_REMOUNT|(devices?0:MS_NODEV)|MS_NOSUID|(readonly?MS_RDONLY:0), NULL) != 0) return 3; } return 0; } static int write_to_file (int fd, const char *content) { ssize_t len = strlen (content); ssize_t res; while (len > 0) { res = write (fd, content, len); if (res < 0 && errno == EINTR) continue; if (res <= 0) return -1; len -= res; content += res; } return 0; } static int write_file (const char *path, const char *content) { int fd; int res; fd = open (path, O_RDWR | O_CLOEXEC, 0); if (fd == -1) return -1; res = 0; if (content) res = write_to_file (fd, content); close (fd); return res; } static int create_file (const char *path, mode_t mode, const char *content) { int fd; int res; fd = creat (path, mode); if (fd == -1) return -1; res = 0; if (content) res = write_to_file (fd, content); close (fd); return res; } static void create_files (const create_table_t *create, int n_create) { bool last_failed = FALSE; int i; for (i = 0; i < n_create; i++) { char *name; char *data = NULL; mode_t mode = create[i].mode; file_flags_t flags = create[i].flags; int *option = create[i].option; char *in_root; int k; bool found; int res; if ((flags & FILE_FLAGS_IF_LAST_FAILED) && !last_failed) continue; if (option && !*option) continue; name = strdup_printf (create[i].name, getuid()); if (create[i].data) data = strdup_printf (create[i].data, getuid()); last_failed = FALSE; switch (create[i].type) { case FILE_TYPE_DIR: if (mkdir (name, mode) != 0) die_with_error ("creating dir %s", name); break; case FILE_TYPE_REGULAR: if (create_file (name, mode, NULL)) die_with_error ("creating file %s", name); break; case FILE_TYPE_SYSTEM_SYMLINK: { struct stat buf; in_root = strconcat ("/", name); if (stat (in_root, &buf) == 0) { if (mkdir (name, mode) != 0) die_with_error ("creating dir %s", name); if (bind_mount (in_root, name, BIND_PRIVATE | BIND_READONLY)) die_with_error ("mount %s", name); } free (in_root); break; } case FILE_TYPE_SYMLINK: if (symlink (data, name) != 0) die_with_error ("creating symlink %s", name); break; case FILE_TYPE_BIND: case FILE_TYPE_BIND_RO: if ((res = bind_mount (data, name, 0 | ((create[i].type == FILE_TYPE_BIND_RO) ? BIND_READONLY : 0) | ((flags & FILE_FLAGS_DEVICES) ? BIND_DEVICES : 0) | ((flags & FILE_FLAGS_NOREMOUNT) ? BIND_NOREMOUNT : 0) ))) { if (res > 1 || (flags & FILE_FLAGS_NON_FATAL) == 0) die_with_error ("mounting bindmount %s", name); last_failed = TRUE; } break; case FILE_TYPE_MOUNT: found = FALSE; for (k = 0; k < N_ELEMENTS(mount_table); k++) { if (strcmp (mount_table[k].where, name) == 0) { if (mount(mount_table[k].what, mount_table[k].where, mount_table[k].type, mount_table[k].flags, mount_table[k].options) < 0) die_with_error ("Mounting %s", name); found = TRUE; } } if (!found) die ("Unable to find mount %s\n", name); break; case FILE_TYPE_REMOUNT: if (mount ("none", name, NULL, MS_MGC_VAL|MS_REMOUNT|mode, NULL) != 0) die_with_error ("Unable to remount %s\n", name); break; case FILE_TYPE_DEVICE: if (create_file (name, mode, NULL)) die_with_error ("creating file %s", name); in_root = strconcat ("/", name); if ((res = bind_mount (in_root, name, BIND_DEVICES))) { if (res > 1 || (flags & FILE_FLAGS_NON_FATAL) == 0) die_with_error ("binding device %s", name); } free (in_root); break; default: die ("Unknown create type %d\n", create[i].type); } free (name); free (data); } } int main (int argc, char **argv) { mode_t old_umask; char *newroot; char *args[] = {"sh", NULL}; char old_cwd[256]; char *uid_map, *gid_map; int uid, gid; pid_t pid; newroot = "/tmp/.xdg-app-root"; if (mkdir (newroot, 0755) && errno != EEXIST) die_with_error ("Creating xdg-app-root failed"); __debug__(("creating new namespace\n")); uid = getuid (); gid = getgid (); pid = raw_clone (SIGCHLD | CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWUSER, NULL); if (pid == -1) die_with_error ("Creating new namespace failed"); if (pid != 0) exit (0); uid_map = strdup_printf ("%d %d 1\n", uid, uid); if (write_file ("/proc/self/uid_map", uid_map) < 0) die_with_error ("setting up uid map"); free (uid_map); if (write_file("/proc/self/setgroups", "deny\n") < 0) die_with_error ("error writing to setgroups"); gid_map = strdup_printf ("%d %d 1\n", gid, gid); if (write_file ("/proc/self/gid_map", gid_map) < 0) die_with_error ("setting up gid map"); free (gid_map); old_umask = umask (0); if (mount (NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) die_with_error ("Failed to make / slave"); if (mount ("", newroot, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL) != 0) die_with_error ("Failed to mount tmpfs"); getcwd (old_cwd, sizeof (old_cwd)); if (chdir (newroot) != 0) die_with_error ("chdir"); create_files (create, N_ELEMENTS (create)); if (pivot_root (newroot, ".oldroot")) die_with_error ("pivot_root"); chdir ("/"); /* The old root better be rprivate or we will send unmount events to the parent namespace */ if (mount (".oldroot", ".oldroot", NULL, MS_REC|MS_PRIVATE, NULL) != 0) die_with_error ("Failed to make old root rprivate"); if (umount2 (".oldroot", MNT_DETACH)) die_with_error ("unmount oldroot"); umask (old_umask); chdir (old_cwd); __debug__(("forking for child\n")); if (execvp (args[0], args) == -1) die_with_error ("execvp %s", args[0]); return 0; }
_______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers