On Fri, 2 Jun 2017 09:02:10 +0200
Heiko Carstens <heiko.carstens@xxxxxxxxxx> wrote:

> On Thu, Jun 01, 2017 at 12:46:51PM +0200, Martin Schwidefsky wrote:
> > > Unfortunately, converting all page tables to 4k pgste page tables is
> > > not possible without provoking various race conditions.
> >
> > That is one approach we tried and was found to be buggy. The point is that
> > you are not allowed to reallocate a page table while a VMA exists that is
> > in the address range of that page table.
> >
> > Another approach we tried is to use an ELF flag on the qemu executable.
> > That does not work either because fs/exec.c allocates and populates the
> > new mm struct for the argument pages before fs/binfmt_elf.c comes into
> > play.
>
> How about if you would fail the system call within arch_check_elf() if you
> detect that the binary requires pgstes (as indicated by elf flags) and then
> restart the system call?
>
> That is: arch_check_elf() e.g. would set a thread flag that future mm's
> should be allocated with pgstes. Then do_execve() would cleanup everything
> and return to entry.S. Upon return to userspace we detect this condition
> and simply restart the system call, similar to signals vs -ERESTARTSYS.
>
> That would make do_execve() cleanup everything and upon reentering it would
> allocate an mm with the pgste flag set.
>
> Maybe this is a bit over-simplified, but might work.
This is not over-simplified at all, that does work: -- diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 69a77eecaec1..7bd182676ddd 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -64,6 +64,7 @@ config ARCH_SUPPORTS_UPROBES config S390 def_bool y + select ARCH_BINFMT_ELF_STATE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index e8f623041769..79911231f9e6 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -151,6 +151,28 @@ extern unsigned int vdso_enabled; && (x)->e_ident[EI_CLASS] == ELF_CLASS) #define compat_start_thread start_thread31 +struct arch_elf_state { +}; + +#define INIT_ARCH_ELF_STATE { } + +#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) (0) +#define arch_check_elf(ehdr, interp, interp_ehdr, state) \ +({ \ + struct elf64_hdr *hdr = (void*) ehdr; \ + int _rc = 0; \ + if (hdr->e_ident[EI_CLASS] == ELFCLASS64 && \ + (hdr->e_flags & 0x00000002) && \ + !page_table_allocate_pgste && \ + !current->mm->context.alloc_pgste) { \ + current->mm->context.alloc_pgste = 1; \ + set_pt_regs_flag(task_pt_regs(current), \ + PIF_SYSCALL_RESTART); \ + _rc = -EAGAIN; \ + } \ + _rc; \ +}) + /* For SVR4/S390 the function pointer to be registered with `atexit` is passed in R14. 
*/ #define ELF_PLAT_INIT(_r, load_addr) \ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index c119d564d8f2..268a5d22ce1b 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -25,7 +25,8 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.gmap_asce = 0; mm->context.flush_mm = 0; #ifdef CONFIG_PGSTE - mm->context.alloc_pgste = page_table_allocate_pgste; + mm->context.alloc_pgste = page_table_allocate_pgste || + current->mm->context.alloc_pgste; mm->context.has_pgste = 0; mm->context.use_skey = 0; mm->context.use_cmma = 0; diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 99bc456cc26a..24baa80f7af6 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -11,9 +11,11 @@ #define PIF_SYSCALL 0 /* inside a system call */ #define PIF_PER_TRAP 1 /* deliver sigtrap on return to user */ +#define PIF_SYSCALL_RESTART 2 /* restart the current system call */ #define _PIF_SYSCALL _BITUL(PIF_SYSCALL) #define _PIF_PER_TRAP _BITUL(PIF_PER_TRAP) +#define _PIF_SYSCALL_RESTART _BITUL(PIF_SYSCALL_RESTART) #ifndef __ASSEMBLY__ diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 0c2c3b8bfc9a..8c824b32527a 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -52,7 +52,7 @@ _TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ _TIF_SYSCALL_TRACEPOINT) _CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \ _CIF_ASCE_SECONDARY | _CIF_FPU) -_PIF_WORK = (_PIF_PER_TRAP) +_PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART) #define BASED(name) name-cleanup_critical(%r13) @@ -342,6 +342,8 @@ ENTRY(system_call) jo .Lsysc_guarded_storage TSTMSK __PT_FLAGS(%r11),_PIF_PER_TRAP jo .Lsysc_singlestep + TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART + jo .Lsysc_syscall_restart TSTMSK __TI_flags(%r12),_TIF_SIGPENDING jo .Lsysc_sigpending TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME @@ -434,6 
+436,15 @@ ENTRY(system_call) jg do_per_trap # +# _PIF_SYSCALL_RESTART is set, repeat the current system call +# +.Lsysc_syscall_restart: + ni __PT_FLAGS+7(%r11),255-_PIF_SYSCALL_RESTART + lmg %r1,%r7,__PT_R1(%r11) # load svc arguments + lg %r2,__PT_ORIG_GPR2(%r11) + j .Lsysc_do_svc + +# # call tracehook_report_syscall_entry/tracehook_report_syscall_exit before # and after the system call # -- blue skies, Martin. "Reality continues to ruin my life." - Calvin.