2017-06-29 22:24 GMT+03:00 Michael Holzheu <holzheu@xxxxxxxxxxxxxxxxxx>: > This patch only adds the support but does not enable it for building. > > Reviewed-by: Alice Frosi <alice@xxxxxxxxxxxxxxxxxx> > Signed-off-by: Michael Holzheu <holzheu@xxxxxxxxxxxxxxxxxx> LGTM, and looking at diffstat - shouldn't break anything we already have, Reviewed-by: Dmitry Safonov <dsafonov@xxxxxxxxxxxxx> > --- ... > diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-common-s390.S b/compel/arch/s390/plugins/std/syscalls/syscall-common-s390.S > new file mode 100644 > index 0000000..79e3b8e > --- /dev/null > +++ b/compel/arch/s390/plugins/std/syscalls/syscall-common-s390.S > @@ -0,0 +1,37 @@ > +#include "common/asm/linkage.h" > + > +/* > + * Define a system call > + * > + * C-ABI on s390: > + * - Parameters 1-5 are passed in %r2-%r6 > + * - Parameter 6 is passed on the stack 160(%r15) > + * - Return value is in %r2 > + * - Return address is in %r14 > + * - Registers %r0-%r6,%r14 are call-clobbered > + * - Registers %r7-%r13,%r15 are call-saved > + * > + * SVC ABI on s390: > + * - For SVC 0 the system call number is passed in %r1 > + * - Parameters 1-6 are passed in %r2-%r7 > + * - Return value is passed in %r2 > + * - Besides of %r2 all registers are call-saved > + */ I like the comment :) > +#define SYSCALL(name, opcode) \ > +ENTRY(name); \ > + lgr %r0,%r7; /* Save %r7 */ \ > + lg %r7,160(%r15); /* Load 6th parameter */ \ > + lghi %r1,opcode; /* Load SVC number */ \ > + svc 0; /* Issue SVC 0 */ \ > + lgr %r7,%r0; /* Restore %r7 */ \ > + br %r14; /* Return to caller */ \ > +END(name) \ > + > +/* > + * Issue rt_sigreturn system call for sa_restorer > + */ > +ENTRY(__cr_restore_rt) > + lghi %r1,__NR_rt_sigreturn > + svc 0 > +END(__cr_restore_rt) > + ... 
> +/* > + * Issue s390 mmap call > + */ > +void *remote_mmap(struct parasite_ctl *ctl, > + void *addr, size_t length, int prot, > + int flags, int fd, off_t offset) > +{ > + void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE; > + struct mmap_arg_struct arg_struct; > + pid_t pid = ctl->rpid; > + long map = 0; > + int err; > + > + /* Setup s390 mmap data */ > + arg_struct.addr = (unsigned long)addr; > + arg_struct.len = length; > + arg_struct.prot = prot; > + arg_struct.flags = flags; > + arg_struct.fd = fd; > + arg_struct.offset = offset; > + > + /* Move args to process */ > + if (ptrace_swap_area(pid, where, &arg_struct, sizeof(arg_struct))) { > + pr_err("Can't inject memfd args (pid: %d)\n", pid); > + return NULL; > + } > + > + /* Do syscall */ > + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long) where, > + 0, 0, 0, 0, 0); > + if (err < 0 || (long)map < 0) > + map = 0; > + > + /* Restore data */ > + if (ptrace_poke_area(pid, &arg_struct, where, sizeof(arg_struct))) { > + pr_err("Can't restore mmap args (pid: %d)\n", pid); > + if (map != 0) { > + err = compel_syscall(ctl, __NR_munmap, NULL, map, > + length, 0, 0, 0, 0); Well, we could avoid setting err here, as we can't handle it anyway - we're so fucked up here %) But that's not significant. ... > +/* > + * Kernel task size level > + * > + * We have (dynamic) 4 level page tables for 64 bit since linux 2.6.25: > + * > + * 5a216a2083 ("[S390] Add four level page tables for CONFIG_64BIT=y.") > + * 6252d702c5 ("[S390] dynamic page tables.") > + * > + * The code below is already prepared for future (dynamic) 5 level page tables. 
> + * > + * Besides that there is one problematic kernel bug that has been fixed for > + * linux 4.11 by the following commit: > + * > + * ee71d16d22 ("s390/mm: make TASK_SIZE independent from the number > + * of page table levels") > + * > + * A 64 bit process on s390x always starts with 3 levels and upgrades to 4 > + * levels for mmap(> 4 TB) and to 5 levels for mmap(> 16 EB). > + * > + * Unfortunately before fix ee71d16d22 for a 3 level process munmap() > + * and mremap() fail for addresses > 4 TB. CRIU uses the task size, > + * to unmap() all memory from a starting point to task size to get rid of > + * unwanted mappings. CRIU uses mremap() to establish the final mappings > + * which also fails if we want to restore mappings > 4 TB and the initial > + * restore process still runs with 3 levels. > + * > + * To support the current CRIU design on s390 we return task size = 4 TB when > + * a kernel without fix ee71d16d22 is detected. In this case we can dump at > + * least processes with < 4 TB which is the most likely case anyway. > + * > + * For kernels with fix ee71d16d22 we are fully functional. 
> + */ > +enum kernel_ts_level { > + /* Kernel with 4 level page tables without fix ee71d16d22 */ > + KERNEL_TS_LEVEL_4_FIX_NO, > + /* Kernel with 4 level page tables with fix ee71d16d22 */ > + KERNEL_TS_LEVEL_4_FIX_YES, > + /* Kernel with 4 level page tables with or without fix ee71d16d22 */ > + KERNEL_TS_LEVEL_4_FIX_UNKN, > + /* Kernel with 5 level page tables */ > + KERNEL_TS_LEVEL_5, > +}; > + > +/* See arch/s390/include/asm/processor.h */ > +#define TASK_SIZE_LEVEL_3 0x40000000000UL /* 4 TB */ > +#define TASK_SIZE_LEVEL_4 0x20000000000000UL /* 8 PB */ > +#define TASK_SIZE_LEVEL_5 0xffffffffffffefffUL /* 16 EB - 0x1000 */ > + > +/* > + * Return detected kernel version regarding task size level > + * > + * We use unmap() to probe the maximum possible page table level of kernel > + */ > +static enum kernel_ts_level get_kernel_ts_level(void) > +{ > + unsigned long criu_end_addr = max_mapped_addr(); > + > + /* Check for 5 levels */ > + if (criu_end_addr > TASK_SIZE_LEVEL_4) > + return KERNEL_TS_LEVEL_5; > + else if (munmap((void *) TASK_SIZE_LEVEL_4, 0x1000) == 0) Can't we accidentally unmap something that's already mapped? Or does the kernel prevent mmap() at such a high address? > + return KERNEL_TS_LEVEL_5; > + > + if (criu_end_addr < TASK_SIZE_LEVEL_3) { > + /* Check for 4 level kernel with fix */ > + if (munmap((void *) TASK_SIZE_LEVEL_3, 0x1000) == 0) > + return KERNEL_TS_LEVEL_4_FIX_YES; > + else > + return KERNEL_TS_LEVEL_4_FIX_NO; > + } > + /* We can't find out if kernel has the fix */ > + return KERNEL_TS_LEVEL_4_FIX_UNKN; > +} > + > +/* -- Dmitry -- To unsubscribe from this list: send the line "unsubscribe linux-s390" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html