Re: [PATCH 3.18 04/84] x86_64, switch_to(): Load TLS descriptors before switching DS and ES

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 01/07/2015, 02:49 AM, Greg Kroah-Hartman wrote:
> 3.18-stable review patch.  If anyone has any objections, please let me know.

Greg, Andi raised an objection against this one for 3.12 which still
holds here and for other trees:
https://www.mail-archive.com/stable@xxxxxxxxxxxxxxx/msg106471.html

Quoting:
On 01/06/2015, 07:53 PM, Andi Kleen wrote:
> IMHO this is not stable material. Significant risk you broke
> something obscure, and it's not clear it fixes any real problem.
>
> At least wait some more time first.

Thanks.

> ------------------
> 
> From: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
> 
> commit f647d7c155f069c1a068030255c300663516420e upstream.
> 
> Otherwise, if buggy user code points DS or ES into the TLS
> array, they would be corrupted after a context switch.
> 
> This also significantly improves the comments and documents some
> gotchas in the code.
> 
> Before this patch, the both tests below failed.  With this
> patch, the es test passes, although the gsbase test still fails.
> 
>  ----- begin es test -----
> 
> /*
>  * Copyright (c) 2014 Andy Lutomirski
>  * GPL v2
>  */
> 
> static unsigned short GDT3(int idx)
> {
> 	return (idx << 3) | 3;
> }
> 
> static int create_tls(int idx, unsigned int base)
> {
> 	struct user_desc desc = {
> 		.entry_number    = idx,
> 		.base_addr       = base,
> 		.limit           = 0xfffff,
> 		.seg_32bit       = 1,
> 		.contents        = 0, /* Data, grow-up */
> 		.read_exec_only  = 0,
> 		.limit_in_pages  = 1,
> 		.seg_not_present = 0,
> 		.useable         = 0,
> 	};
> 
> 	if (syscall(SYS_set_thread_area, &desc) != 0)
> 		err(1, "set_thread_area");
> 
> 	return desc.entry_number;
> }
> 
> int main()
> {
> 	int idx = create_tls(-1, 0);
> 	printf("Allocated GDT index %d\n", idx);
> 
> 	unsigned short orig_es;
> 	asm volatile ("mov %%es,%0" : "=rm" (orig_es));
> 
> 	int errors = 0;
> 	int total = 1000;
> 	for (int i = 0; i < total; i++) {
> 		asm volatile ("mov %0,%%es" : : "rm" (GDT3(idx)));
> 		usleep(100);
> 
> 		unsigned short es;
> 		asm volatile ("mov %%es,%0" : "=rm" (es));
> 		asm volatile ("mov %0,%%es" : : "rm" (orig_es));
> 		if (es != GDT3(idx)) {
> 			if (errors == 0)
> 				printf("[FAIL]\tES changed from 0x%hx to 0x%hx\n",
> 				       GDT3(idx), es);
> 			errors++;
> 		}
> 	}
> 
> 	if (errors) {
> 		printf("[FAIL]\tES was corrupted %d/%d times\n", errors, total);
> 		return 1;
> 	} else {
> 		printf("[OK]\tES was preserved\n");
> 		return 0;
> 	}
> }
> 
>  ----- end es test -----
> 
>  ----- begin gsbase test -----
> 
> /*
>  * gsbase.c, a gsbase test
>  * Copyright (c) 2014 Andy Lutomirski
>  * GPL v2
>  */
> 
> static unsigned char *testptr, *testptr2;
> 
> static unsigned char read_gs_testvals(void)
> {
> 	unsigned char ret;
> 	asm volatile ("movb %%gs:%1, %0" : "=r" (ret) : "m" (*testptr));
> 	return ret;
> }
> 
> int main()
> {
> 	int errors = 0;
> 
> 	testptr = mmap((void *)0x200000000UL, 1, PROT_READ | PROT_WRITE,
> 		       MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
> 	if (testptr == MAP_FAILED)
> 		err(1, "mmap");
> 
> 	testptr2 = mmap((void *)0x300000000UL, 1, PROT_READ | PROT_WRITE,
> 		       MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
> 	if (testptr2 == MAP_FAILED)
> 		err(1, "mmap");
> 
> 	*testptr = 0;
> 	*testptr2 = 1;
> 
> 	if (syscall(SYS_arch_prctl, ARCH_SET_GS,
> 		    (unsigned long)testptr2 - (unsigned long)testptr) != 0)
> 		err(1, "ARCH_SET_GS");
> 
> 	usleep(100);
> 
> 	if (read_gs_testvals() == 1) {
> 		printf("[OK]\tARCH_SET_GS worked\n");
> 	} else {
> 		printf("[FAIL]\tARCH_SET_GS failed\n");
> 		errors++;
> 	}
> 
> 	asm volatile ("mov %0,%%gs" : : "r" (0));
> 
> 	if (read_gs_testvals() == 0) {
> 		printf("[OK]\tWriting 0 to gs worked\n");
> 	} else {
> 		printf("[FAIL]\tWriting 0 to gs failed\n");
> 		errors++;
> 	}
> 
> 	usleep(100);
> 
> 	if (read_gs_testvals() == 0) {
> 		printf("[OK]\tgsbase is still zero\n");
> 	} else {
> 		printf("[FAIL]\tgsbase was corrupted\n");
> 		errors++;
> 	}
> 
> 	return errors == 0 ? 0 : 1;
> }
> 
>  ----- end gsbase test -----
> 
> Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
> Cc: Andi Kleen <andi@xxxxxxxxxxxxxx>
> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> Link: http://lkml.kernel.org/r/509d27c9fec78217691c3dad91cec87e1006b34a.1418075657.git.luto@xxxxxxxxxxxxxx
> Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
> 
> ---
>  arch/x86/kernel/process_64.c |  101 +++++++++++++++++++++++++++++++------------
>  1 file changed, 73 insertions(+), 28 deletions(-)
> 
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p,
>  
>  	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
>  
> -	/*
> -	 * Reload esp0, LDT and the page table pointer:
> -	 */
> +	/* Reload esp0 and ss1. */
>  	load_sp0(tss, next);
>  
> -	/*
> -	 * Switch DS and ES.
> -	 * This won't pick up thread selector changes, but I guess that is ok.
> -	 */
> -	savesegment(es, prev->es);
> -	if (unlikely(next->es | prev->es))
> -		loadsegment(es, next->es);
> -
> -	savesegment(ds, prev->ds);
> -	if (unlikely(next->ds | prev->ds))
> -		loadsegment(ds, next->ds);
> -
> -
>  	/* We must save %fs and %gs before load_TLS() because
>  	 * %fs and %gs may be cleared by load_TLS().
>  	 *
> @@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p,
>  	savesegment(fs, fsindex);
>  	savesegment(gs, gsindex);
>  
> +	/*
> +	 * Load TLS before restoring any segments so that segment loads
> +	 * reference the correct GDT entries.
> +	 */
>  	load_TLS(next, cpu);
>  
>  	/*
> -	 * Leave lazy mode, flushing any hypercalls made here.
> -	 * This must be done before restoring TLS segments so
> -	 * the GDT and LDT are properly updated, and must be
> -	 * done before math_state_restore, so the TS bit is up
> -	 * to date.
> +	 * Leave lazy mode, flushing any hypercalls made here.  This
> +	 * must be done after loading TLS entries in the GDT but before
> +	 * loading segments that might reference them, and and it must
> +	 * be done before math_state_restore, so the TS bit is up to
> +	 * date.
>  	 */
>  	arch_end_context_switch(next_p);
>  
> +	/* Switch DS and ES.
> +	 *
> +	 * Reading them only returns the selectors, but writing them (if
> +	 * nonzero) loads the full descriptor from the GDT or LDT.  The
> +	 * LDT for next is loaded in switch_mm, and the GDT is loaded
> +	 * above.
> +	 *
> +	 * We therefore need to write new values to the segment
> +	 * registers on every context switch unless both the new and old
> +	 * values are zero.
> +	 *
> +	 * Note that we don't need to do anything for CS and SS, as
> +	 * those are saved and restored as part of pt_regs.
> +	 */
> +	savesegment(es, prev->es);
> +	if (unlikely(next->es | prev->es))
> +		loadsegment(es, next->es);
> +
> +	savesegment(ds, prev->ds);
> +	if (unlikely(next->ds | prev->ds))
> +		loadsegment(ds, next->ds);
> +
>  	/*
>  	 * Switch FS and GS.
>  	 *
> -	 * Segment register != 0 always requires a reload.  Also
> -	 * reload when it has changed.  When prev process used 64bit
> -	 * base always reload to avoid an information leak.
> +	 * These are even more complicated than FS and GS: they have
> +	 * 64-bit bases are that controlled by arch_prctl.  Those bases
> +	 * only differ from the values in the GDT or LDT if the selector
> +	 * is 0.
> +	 *
> +	 * Loading the segment register resets the hidden base part of
> +	 * the register to 0 or the value from the GDT / LDT.  If the
> +	 * next base address zero, writing 0 to the segment register is
> +	 * much faster than using wrmsr to explicitly zero the base.
> +	 *
> +	 * The thread_struct.fs and thread_struct.gs values are 0
> +	 * if the fs and gs bases respectively are not overridden
> +	 * from the values implied by fsindex and gsindex.  They
> +	 * are nonzero, and store the nonzero base addresses, if
> +	 * the bases are overridden.
> +	 *
> +	 * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
> +	 * be impossible.
> +	 *
> +	 * Therefore we need to reload the segment registers if either
> +	 * the old or new selector is nonzero, and we need to override
> +	 * the base address if next thread expects it to be overridden.
> +	 *
> +	 * This code is unnecessarily slow in the case where the old and
> +	 * new indexes are zero and the new base is nonzero -- it will
> +	 * unnecessarily write 0 to the selector before writing the new
> +	 * base address.
> +	 *
> +	 * Note: This all depends on arch_prctl being the only way that
> +	 * user code can override the segment base.  Once wrfsbase and
> +	 * wrgsbase are enabled, most of this code will need to change.
>  	 */
>  	if (unlikely(fsindex | next->fsindex | prev->fs)) {
>  		loadsegment(fs, next->fsindex);
> +
>  		/*
> -		 * Check if the user used a selector != 0; if yes
> -		 *  clear 64bit base, since overloaded base is always
> -		 *  mapped to the Null selector
> +		 * If user code wrote a nonzero value to FS, then it also
> +		 * cleared the overridden base address.
> +		 *
> +		 * XXX: if user code wrote 0 to FS and cleared the base
> +		 * address itself, we won't notice and we'll incorrectly
> +		 * restore the prior base address next time we reschdule
> +		 * the process.
>  		 */
>  		if (fsindex)
>  			prev->fs = 0;
>  	}
> -	/* when next process has a 64bit base use it */
>  	if (next->fs)
>  		wrmsrl(MSR_FS_BASE, next->fs);
>  	prev->fsindex = fsindex;
>  
>  	if (unlikely(gsindex | next->gsindex | prev->gs)) {
>  		load_gs_index(next->gsindex);
> +
> +		/* This works (and fails) the same way as fsindex above. */
>  		if (gsindex)
>  			prev->gs = 0;
>  	}
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe stable" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


-- 
js
suse labs
--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]