[PATCH] paravirt_ops x86_64 , take 2

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello all,

Here's a new version of the paravirt_ops x86_64 patch. With this
message, I'm sending an incremental patch. The complete patches can be
found, from now on, at http://et.redhat.com/~gcosta/paravirt_ops/

The main aim of this new update is to fix a critical bug, namely,
Rusty's name. However, I took the opportunity to write some less
important new pieces of code, highlighting:

* proper casts in places where macros were replaced by functions and
the argument types happened to mismatch.
* calling paravirt_ops functions from .S files (I lacked this last time)
* addition of the startup_paravirt function, to kick off guests (not
tested) 
* fixed problems with patching
* added a new field, vsyscall_page in the paravirt_ops struct, which
allows the kernel to map a vsyscall_page on its own
* fixed vsyscall functions to avoid calling paravirt_ops functions.
__vsyscall_0 is the page to be mapped for the host. (set and get cpu not
yet tested.)
* fixed cpuid calls. 
* added substitute for the swapgs instruction. (Notice that I'm not
saying it works ;-) )

In my TODO list, you can find: 
* putting swapgs to work
* making sure legacy mode binaries work 
* merging in valuable commentaries from all you ;-)

-- 
Glauber de Oliveira Costa
Red Hat Inc.
"Free as in Freedom"
-------------- next part --------------
diff -urp linux-2.6.19-paravirt0/arch/i386/kernel/alternative.c linux-2.6.19-paravirt1/arch/i386/kernel/alternative.c
--- linux-2.6.19-paravirt0/arch/i386/kernel/alternative.c	2007-01-11 21:57:07.000000000 -0200
+++ linux-2.6.19-paravirt1/arch/i386/kernel/alternative.c	2007-01-11 21:42:22.000000000 -0200
@@ -431,9 +431,7 @@ void __init alternative_instructions(voi
 	}
 #endif
 #ifdef CONFIG_PARAVIRT
-  #ifndef CONFIG_X86_64 /* Not working properly yet */
  	apply_paravirt(__start_parainstructions, __stop_parainstructions);
-  #endif
 #endif
 	local_irq_restore(flags);
 }
diff -urp linux-2.6.19-paravirt0/arch/x86_64/ia32/syscall32.c linux-2.6.19-paravirt1/arch/x86_64/ia32/syscall32.c
--- linux-2.6.19-paravirt0/arch/x86_64/ia32/syscall32.c	2007-01-11 21:51:35.000000000 -0200
+++ linux-2.6.19-paravirt1/arch/x86_64/ia32/syscall32.c	2007-01-09 11:01:19.000000000 -0200
@@ -104,5 +104,5 @@ void syscall32_cpu_init(void)
 	checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
 	checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
 
-	wrmsrl(MSR_CSTAR, ia32_cstar_target);
+	wrmsrl(MSR_CSTAR, (u64)ia32_cstar_target);
 }
diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/asm-offsets.c linux-2.6.19-paravirt1/arch/x86_64/kernel/asm-offsets.c
--- linux-2.6.19-paravirt0/arch/x86_64/kernel/asm-offsets.c	2007-01-11 21:56:03.000000000 -0200
+++ linux-2.6.19-paravirt1/arch/x86_64/kernel/asm-offsets.c	2007-01-11 09:46:44.000000000 -0200
@@ -79,9 +79,10 @@ int main(void)
 	ENTRY(paravirt_enabled);
 	ENTRY(irq_disable);
 	ENTRY(irq_enable);
-	ENTRY(irq_enable_sysexit);
+	ENTRY(sysret);
 	ENTRY(iret);
-	ENTRY(read_cr0);
+	ENTRY(read_cr2);
+	ENTRY(swapgs);
 #endif
 
 	return 0;
diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/entry.S linux-2.6.19-paravirt1/arch/x86_64/kernel/entry.S
--- linux-2.6.19-paravirt0/arch/x86_64/kernel/entry.S	2007-01-11 21:56:03.000000000 -0200
+++ linux-2.6.19-paravirt1/arch/x86_64/kernel/entry.S	2007-01-11 22:22:26.000000000 -0200
@@ -51,6 +51,13 @@
 #include <asm/page.h>
 #include <asm/irqflags.h>
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define ENABLE_INTERRUPTS(x)	sti
+#define DISABLE_INTERRUPTS(x)	cli
+#define SYSRETQ			sysretq
+#endif
 	.code64
 
 #ifndef CONFIG_PREEMPT
@@ -179,6 +186,7 @@ rff_trace:
 	CFI_ENDPROC
 END(ret_from_fork)
 
+
 /*
  * System call entry. Upto 6 arguments in registers are supported.
  *
@@ -223,7 +231,7 @@ ENTRY(system_call)
 	 * No need to follow this irqs off/on section - it's straight
 	 * and short:
 	 */
-	sti					
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_ARGS 8,1
 	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
@@ -245,7 +253,7 @@ ret_from_sys_call:
 	/* edi:	flagmask */
 sysret_check:		
 	GET_THREAD_INFO(%rcx)
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	movl threadinfo_flags(%rcx),%edx
 	andl %edi,%edx
@@ -261,7 +269,7 @@ sysret_check:		
 	/*CFI_REGISTER	rflags,r11*/
 	movq	%gs:pda_oldrsp,%rsp
 	swapgs
-	sysretq
+	SYSRETQ
 
 	CFI_RESTORE_STATE
 	/* Handle reschedules */
@@ -270,7 +278,7 @@ sysret_careful:
 	bt $TIF_NEED_RESCHED,%edx
 	jnc sysret_signal
 	TRACE_IRQS_ON
-	sti
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq %rdi
 	CFI_ADJUST_CFA_OFFSET 8
 	call schedule
@@ -281,7 +289,7 @@ sysret_careful:
 	/* Handle a signal */ 
 sysret_signal:
 	TRACE_IRQS_ON
-	sti
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
 	jz    1f
 
@@ -294,7 +302,7 @@ sysret_signal:
 1:	movl $_TIF_NEED_RESCHED,%edi
 	/* Use IRET because user could have changed frame. This
 	   works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check
 	
@@ -326,7 +334,7 @@ tracesys:			 
  */
 	.globl int_ret_from_sys_call
 int_ret_from_sys_call:
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	testl $3,CS-ARGOFFSET(%rsp)
 	je retint_restore_args
@@ -347,20 +355,20 @@ int_careful:
 	bt $TIF_NEED_RESCHED,%edx
 	jnc  int_very_careful
 	TRACE_IRQS_ON
-	sti
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq %rdi
 	CFI_ADJUST_CFA_OFFSET 8
 	call schedule
 	popq %rdi
 	CFI_ADJUST_CFA_OFFSET -8
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check
 
 	/* handle signals and tracing -- both require a full stack frame */
 int_very_careful:
 	TRACE_IRQS_ON
-	sti
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_REST
 	/* Check for syscall exit trace */	
 	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
@@ -383,7 +391,7 @@ int_signal:
 1:	movl $_TIF_NEED_RESCHED,%edi	
 int_restore_rest:
 	RESTORE_REST
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check
 	CFI_ENDPROC
@@ -525,7 +533,7 @@ ENTRY(common_interrupt)
 	interrupt do_IRQ
 	/* 0(%rsp): oldrsp-ARGOFFSET */
 ret_from_intr:
-	cli	
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	decl %gs:pda_irqcount
 	leaveq
@@ -552,13 +560,13 @@ retint_swapgs:	 	
 	/*
 	 * The iretq could re-enable interrupts:
 	 */
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_IRETQ
 	swapgs 
 	jmp restore_args
 
 retint_restore_args:				
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	/*
 	 * The iretq could re-enable interrupts:
 	 */
@@ -566,35 +574,22 @@ retint_restore_args:				
 restore_args:
 	RESTORE_ARGS 0,8,0						
 iret_label:	
-	iretq
+	INTERRUPT_RETURN
 
-	.section __ex_table,"a"
-	.quad iret_label,bad_iret	
-	.previous
-	.section .fixup,"ax"
-	/* force a signal here? this matches i386 behaviour */
-	/* running with kernel gs */
-bad_iret:
-	movq $11,%rdi	/* SIGSEGV */
-	TRACE_IRQS_ON
-	sti
-	jmp do_exit			
-	.previous	
-	
 	/* edi: workmask, edx: work */
 retint_careful:
 	CFI_RESTORE_STATE
 	bt    $TIF_NEED_RESCHED,%edx
 	jnc   retint_signal
 	TRACE_IRQS_ON
-	sti
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq %rdi
 	CFI_ADJUST_CFA_OFFSET	8
 	call  schedule
 	popq %rdi		
 	CFI_ADJUST_CFA_OFFSET	-8
 	GET_THREAD_INFO(%rcx)
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp retint_check
 	
@@ -602,14 +597,14 @@ retint_signal:
 	testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
 	jz    retint_swapgs
 	TRACE_IRQS_ON
-	sti
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_REST
 	movq $-1,ORIG_RAX(%rsp) 			
 	xorl %esi,%esi		# oldset
 	movq %rsp,%rdi		# &pt_regs
 	call do_notify_resume
 	RESTORE_REST
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	movl $_TIF_NEED_RESCHED,%edi
 	GET_THREAD_INFO(%rcx)
@@ -738,7 +733,7 @@ END(spurious_interrupt)
 	.if \ist
 	addq	$EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
 	.endif
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	.if \irqtrace
 	TRACE_IRQS_OFF
 	.endif
@@ -770,7 +765,7 @@ paranoid_swapgs\trace:
 	swapgs
 paranoid_restore\trace:
 	RESTORE_ALL 8
-	iretq
+	INTERRUPT_RETURN
 paranoid_userspace\trace:
 	GET_THREAD_INFO(%rcx)
 	movl threadinfo_flags(%rcx),%ebx
@@ -785,11 +780,11 @@ paranoid_userspace\trace:
 	.if \trace
 	TRACE_IRQS_ON
 	.endif
-	sti
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	xorl %esi,%esi 			/* arg2: oldset */
 	movq %rsp,%rdi 			/* arg1: &pt_regs */
 	call do_notify_resume
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	.if \trace
 	TRACE_IRQS_OFF
 	.endif
@@ -798,9 +793,9 @@ paranoid_schedule\trace:
 	.if \trace
 	TRACE_IRQS_ON
 	.endif
-	sti
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	call schedule
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	.if \trace
 	TRACE_IRQS_OFF
 	.endif
@@ -862,7 +857,7 @@ error_sti:	
 error_exit:		
 	movl %ebx,%eax		
 	RESTORE_REST
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)	
 	testl %eax,%eax
@@ -904,7 +899,7 @@ ENTRY(load_gs_index)
 	CFI_STARTPROC
 	pushf
 	CFI_ADJUST_CFA_OFFSET 8
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
         swapgs
 gs_change:     
         movl %edi,%gs   
@@ -1065,18 +1060,32 @@ KPROBE_ENTRY(int3)
 KPROBE_END(int3)
 
 #ifdef CONFIG_PARAVIRT
+/* Not yet working. Do not use */
+ENTRY(native_swapgs)
+	swapgs
+	jmp 	%cs:(paravirt_ops+PARAVIRT_swapgs)
+ENDPROC(native_swapgs)
+
 ENTRY(native_iret)
 1:	iretq
 .section __ex_table,"a"
 	.align 8
 	.quad 1b, bad_iret
 .previous
+.section .fixup,"ax"
+/* force a signal here? this matches i386 behaviour */
+/* running with kernel gs */
+bad_iret:
+	movq $11,%rdi	/* SIGSEGV */
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
+	jmp do_exit
+	.previous
 ENDPROC(native_iret)
 
-ENTRY(native_irq_enable_sysexit)
-	sti
+ENTRY(native_sysret)
 	sysretq
-ENDPROC(native_irq_enable_sysexit)
+ENDPROC(native_sysret)
 
 #endif /* CONFIG_PARAVIRT */
 
diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/head64.c linux-2.6.19-paravirt1/arch/x86_64/kernel/head64.c
--- linux-2.6.19-paravirt0/arch/x86_64/kernel/head64.c	2007-01-11 21:56:03.000000000 -0200
+++ linux-2.6.19-paravirt1/arch/x86_64/kernel/head64.c	2007-01-09 18:13:19.000000000 -0200
@@ -62,7 +62,7 @@ void __init x86_64_start_kernel(char * r
 
 	for (i = 0; i < IDT_ENTRIES; i++)
 		set_intr_gate(i, early_idt_handler);
-	asm volatile("lidt %0" :: "m" (idt_descr));
+	load_idt((const struct desc_struct *)&idt_descr);
 
 	early_printk("Kernel alive\n");
 
diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/head.S linux-2.6.19-paravirt1/arch/x86_64/kernel/head.S
--- linux-2.6.19-paravirt0/arch/x86_64/kernel/head.S	2006-12-11 17:32:53.000000000 -0200
+++ linux-2.6.19-paravirt1/arch/x86_64/kernel/head.S	2007-01-11 22:42:33.000000000 -0200
@@ -16,6 +16,13 @@
 #include <asm/page.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/asm-offsets.h>
+#include <asm/paravirt.h>
+#else
+#define GET_CR2_INTO_RAX mov %cr2, %rax
+#endif
 	
 /* we are not able to switch in one step to the final KERNEL ADRESS SPACE
  * because we need identity-mapped pages on setup so define __START_KERNEL to
@@ -106,6 +113,14 @@ startup_64:
 	 * reload the page tables here.
 	 */
 
+#ifdef CONFIG_PARAVIRT
+	/* a CS ended in 0x3 indicates we're in userspace. That's where
+	 * our paravirt guests run. */
+	movq	%cs, %rax
+	testq	$0x3, %rax
+	jnz	startup_paravirt
+#endif
+
 	/* Enable PAE mode and PGE */
 	xorq	%rax, %rax
 	btsq	$5, %rax
@@ -208,10 +223,11 @@ ENTRY(early_idt_handler)
 	cmpl $2,early_recursion_flag(%rip)
 	jz  1f
 	incl early_recursion_flag(%rip)
-	xorl %eax,%eax
 	movq 8(%rsp),%rsi	# get rip
 	movq (%rsp),%rdx
-	movq %cr2,%rcx
+	GET_CR2_INTO_RAX
+	movq %rax,%rcx
+	xorq %rax, %rax
 	leaq early_idt_msg(%rip),%rdi
 	call early_printk
 	cmpl $2,early_recursion_flag(%rip)
@@ -232,6 +248,47 @@ early_idt_msg:
 early_idt_ripmsg:
 	.asciz "RIP %s\n"
 
+#ifdef CONFIG_PARAVIRT
+ENTRY(startup_paravirt)
+	cld
+
+	/* initial stack location */
+ 	movq $(init_thread_union+THREAD_SIZE),%rsp
+
+	/* We take pains to preserve all the regs. */
+	pushq	%r11
+	pushq	%r10
+	pushq	%r9
+	pushq	%r8
+	pushq	%rsi
+	pushq	%rdi
+	pushq	%rdx
+	pushq	%rcx
+	pushq	%rax
+
+	/* paravirt.o is last in link, and that probe fn never returns */
+	pushq	$__start_paravirtprobe
+1:
+	movq	0(%rsp), %rax
+	pushq	(%rax)
+	movq	8(%rsp), %rdi
+	call	*(%rsp)
+	popq	%rax
+
+	movq	0x10(%rsp), %rax
+	movq	0x18(%rsp), %rcx
+	movq	0x20(%rsp), %rdx
+	movq	0x28(%rsp), %rdi
+	movq	0x30(%rsp), %rsi
+	movq	0x38(%rsp), %r8
+	movq	0x40(%rsp), %r9
+	movq	0x48(%rsp), %r10
+	movq	0x50(%rsp), %r11
+
+	addl	$8, (%rsp)
+	jmp	1b
+#endif
+
 .code32
 ENTRY(no_long_mode)
 	/* This isn't an x86-64 CPU so hang */
diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/paravirt.c linux-2.6.19-paravirt1/arch/x86_64/kernel/paravirt.c
--- linux-2.6.19-paravirt0/arch/x86_64/kernel/paravirt.c	2007-01-11 21:56:03.000000000 -0200
+++ linux-2.6.19-paravirt1/arch/x86_64/kernel/paravirt.c	2007-01-11 20:10:06.000000000 -0200
@@ -1,6 +1,6 @@
 /*  Paravirtualization interfaces
     Copyright (C) 2007 Glauber de Oliveira Costa, Red Hat Inc.
-    Based on i386 work by Rusty Russel.
+    Based on i386 work by Rusty Russell.
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -59,11 +59,14 @@ void memory_setup(void)
 	asm("start_" #name ": " code "; end_" #name ":")
 DEF_NATIVE(cli, "cli");
 DEF_NATIVE(sti, "sti");
-DEF_NATIVE(popfq, "pushq %rax; popfq");
+/* We push rdi , and pop in rda. This is due to x86_64 calling conventions
+ * Recall that we are patching a function call */
+DEF_NATIVE(popfq, "pushq %rdi; popfq");
 DEF_NATIVE(pushfq, "pushfq; popq %rax");
 DEF_NATIVE(pushfq_cli, "pushfq; popq %rax; cli");
-DEF_NATIVE(iret, "iret");
-DEF_NATIVE(sti_sysretq, "sti; sysretq");
+DEF_NATIVE(iret, "iretq");
+DEF_NATIVE(sysretq, "sysretq");
+DEF_NATIVE(swapgs, "swapgs");
 
 static const struct native_insns
 {
@@ -75,7 +78,8 @@ static const struct native_insns
 	[PARAVIRT_SAVE_FLAGS] = { start_pushfq, end_pushfq },
 	[PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushfq_cli, end_pushfq_cli },
 	[PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
-	[PARAVIRT_STI_SYSRETQ] = { start_sti_sysretq, end_sti_sysretq },
+	[PARAVIRT_SYSRETQ] = { start_sysretq, end_sysretq },
+	[PARAVIRT_SWAPGS] = { start_swapgs, end_swapgs },
 };
 
 static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
@@ -88,7 +92,6 @@ static unsigned native_patch(u8 type, u1
 
 	insn_len = native_insns[type].end - native_insns[type].start;
 
-
 	/* Similarly if we can't fit replacement. */
 	if (len < insn_len)
 		return len;
@@ -243,7 +246,7 @@ static void native_wbinvd(void)
 	asm volatile("wbinvd": : :"memory");
 }
 
-static unsigned long native_read_msr(unsigned int msr, int *err)
+static u64 native_read_msr(unsigned int msr, int *err)
 {
 	unsigned long val;
 
@@ -287,6 +290,13 @@ static u64 native_read_tsc(void)
 	return val;
 }
 
+static u64 native_read_tscp(int *aux)
+{
+	u64 val;
+	asm volatile ("rdtscp" : "=A" (val), "=c" (aux));
+	return val;
+}
+
 static u64 native_read_pmc(void)
 {
 	unsigned long val;
@@ -463,7 +473,8 @@ void native_pmd_clear(pmd_t *pmd)
 
 /* These are in entry.S */
 extern void native_iret(void);
-extern void native_irq_enable_sysexit(void);
+extern void native_sysret(void);
+extern void native_swapgs(void);
 
 static int __init print_banner(void)
 {
@@ -475,12 +486,18 @@ core_initcall(print_banner);
 /* We simply declare start_kernel to be the paravirt probe of last resort. */
 paravirt_probe(start_kernel);
 
+extern unsigned long __vsyscall_0;
 struct paravirt_ops paravirt_ops = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
 	.kernel_rpl = 0,
 	.pgd_alignment = sizeof(pgd_t) * PTRS_PER_PGD,
 
+	.swapgs = {
+		.ret = 0,
+		.fn = native_swapgs,
+	 },
+	.vsyscall_page = &__vsyscall_0,
  	.patch = native_patch,
 	.banner = default_banner,
 	.arch_setup = native_nop,
@@ -512,6 +529,7 @@ struct paravirt_ops paravirt_ops = {
 	.read_msr = native_read_msr,
 	.write_msr = native_write_msr,
 	.read_tsc = native_read_tsc,
+	.read_tscp = native_read_tscp,
 	.read_pmc = native_read_pmc,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
@@ -571,7 +589,7 @@ struct paravirt_ops paravirt_ops = {
 	.make_pud = native_make_pud,
 	.make_pgd = native_make_pgd,
 
-	.irq_enable_sysexit = native_irq_enable_sysexit,
+	.sysret = native_sysret,
 	.iret = native_iret,
 
 	.dup_mmap = (void *)native_nop,
@@ -580,4 +598,5 @@ struct paravirt_ops paravirt_ops = {
 
 	.startup_ipi_hook = (void *)native_nop,
 };
+
 EXPORT_SYMBOL(paravirt_ops);
diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/setup64.c linux-2.6.19-paravirt1/arch/x86_64/kernel/setup64.c
--- linux-2.6.19-paravirt0/arch/x86_64/kernel/setup64.c	2006-12-11 17:32:53.000000000 -0200
+++ linux-2.6.19-paravirt1/arch/x86_64/kernel/setup64.c	2007-01-09 10:24:25.000000000 -0200
@@ -123,7 +123,7 @@ void pda_init(int cpu)
 	asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
 	/* Memory clobbers used to order PDA accessed */
 	mb();
-	wrmsrl(MSR_GS_BASE, pda);
+	wrmsrl(MSR_GS_BASE, (u64)pda);
 	mb();
 
 	pda->cpunumber = cpu; 
@@ -160,7 +160,7 @@ void syscall_init(void)
 	 * but only a 32bit target. LSTAR sets the 64bit rip. 	 
 	 */ 
 	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32); 
-	wrmsrl(MSR_LSTAR, system_call); 
+	wrmsrl(MSR_LSTAR, (u64)system_call); 
 
 #ifdef CONFIG_IA32_EMULATION   		
 	syscall32_cpu_init ();
@@ -223,8 +223,8 @@ void __cpuinit cpu_init (void)
  		memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
 
 	cpu_gdt_descr[cpu].size = GDT_SIZE;
-	asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
-	asm volatile("lidt %0" :: "m" (idt_descr));
+	load_gdt((const struct desc_struct *)&cpu_gdt_descr[cpu]);
+	load_idt((const struct desc_struct *)&idt_descr);
 
 	memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
 	syscall_init();
diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/setup.c linux-2.6.19-paravirt1/arch/x86_64/kernel/setup.c
--- linux-2.6.19-paravirt0/arch/x86_64/kernel/setup.c	2007-01-11 21:56:03.000000000 -0200
+++ linux-2.6.19-paravirt1/arch/x86_64/kernel/setup.c	2007-01-09 10:22:24.000000000 -0200
@@ -341,6 +341,12 @@ static void discover_ebda(void)
 		ebda_size = 64*1024;
 }
 
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+void __attribute__((weak)) memory_setup(void)
+{
+       return setup_memory_region();
+}
+
 void __init setup_arch(char **cmdline_p)
 {
 	printk(KERN_INFO "Command line: %s\n", saved_command_line);
@@ -561,12 +567,6 @@ static int __cpuinit get_model_name(stru
 	return 1;
 }
 
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-void __attribute__((weak)) memory_setup(void)
-{
-       return setup_memory_region();
-}
-
 static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
 {
 	unsigned int n, dummy, eax, ebx, ecx, edx;
diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/vsyscall.c linux-2.6.19-paravirt1/arch/x86_64/kernel/vsyscall.c
--- linux-2.6.19-paravirt0/arch/x86_64/kernel/vsyscall.c	2007-01-11 21:51:35.000000000 -0200
+++ linux-2.6.19-paravirt1/arch/x86_64/kernel/vsyscall.c	2007-01-10 06:57:22.000000000 -0200
@@ -73,7 +73,7 @@ static __always_inline void do_vgettimeo
 		usec = __xtime.tv_nsec / 1000;
 
 		if (__vxtime.mode != VXTIME_HPET) {
-			t = get_cycles_sync();
+			t = vget_cycles_sync();
 			if (t < __vxtime.last_tsc)
 				t = __vxtime.last_tsc;
 			usec += ((t - __vxtime.last_tsc) *
@@ -147,8 +147,8 @@ time_t __vsyscall(1) vtime(time_t *t)
 long __vsyscall(2)
 vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 {
-	unsigned int dummy, p;
-	unsigned long j = 0;
+	unsigned int p;
+	unsigned long dummy, j = 0;
 
 	/* Fast cache - only recompute value once per jiffies and avoid
 	   relatively costly rdtscp/cpuid otherwise.
@@ -162,7 +162,8 @@ vgetcpu(unsigned *cpu, unsigned *node, s
 		p = tcache->blob[1];
 	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
 		/* Load per CPU data from RDTSCP */
-		rdtscp(dummy, dummy, p);
+		/* rdtscp() cannot be called due to the paravirt indirection */
+		asm("rdtscp" : "=A" (dummy), "=c" (p));
 	} else {
 		/* Load per CPU data from GDT */
 		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
@@ -257,7 +258,11 @@ static void __cpuinit vsyscall_set_cpu(i
 	node = cpu_to_node[cpu];
 #endif
 	if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
-		write_rdtscp_aux((node << 12) | cpu);
+		/* This is write_rdtscp_aux. It cannot be called directly
+		 * due to the paravirt indirection */
+		asm("wrmsr"  :  /* no output */
+			     :  "d"(0),
+				"a" ((node << 12) | cpu), "c" (0xc0000103));
 
 	/* Store cpu number in limit so that it can be loaded quickly
 	   in user space in vgetcpu.
@@ -286,8 +291,12 @@ cpu_vsyscall_notifier(struct notifier_bl
 
 static void __init map_vsyscall(void)
 {
+#ifndef CONFIG_PARAVIRT
 	extern char __vsyscall_0;
 	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
+#else
+	unsigned long physaddr_page0 = __pa_symbol(paravirt_ops.vsyscall_page);
+#endif
 
 	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
 	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
@@ -300,7 +309,14 @@ static int __init vsyscall_init(void)
 	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
 	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
 	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
-	map_vsyscall();
+#ifdef CONFIG_PARAVIRT
+	if (paravirt_ops.vsyscall_page)
+#endif
+		map_vsyscall();
+#ifdef CONFIG_PARAVIRT
+	else
+		__sysctl_vsyscall = 0;
+#endif
 #ifdef CONFIG_SYSCTL
 	register_sysctl_table(kernel_root_table2, 0);
 #endif
diff -urp linux-2.6.19-paravirt0/arch/x86_64/mm/pageattr.c linux-2.6.19-paravirt1/arch/x86_64/mm/pageattr.c
--- linux-2.6.19-paravirt0/arch/x86_64/mm/pageattr.c	2007-01-11 21:51:35.000000000 -0200
+++ linux-2.6.19-paravirt1/arch/x86_64/mm/pageattr.c	2007-01-09 18:02:50.000000000 -0200
@@ -81,7 +81,7 @@ static void flush_kernel_map(void *arg)
 		void *adr = page_address(pg);
 		if (cpu_has_clflush)
 			cache_flush_page(adr);
-		__flush_tlb_one(adr);
+		__flush_tlb_one((u64)adr);
 	}
 }
 
diff -urp linux-2.6.19-paravirt0/include/asm-x86_64/alternative.h linux-2.6.19-paravirt1/include/asm-x86_64/alternative.h
--- linux-2.6.19-paravirt0/include/asm-x86_64/alternative.h	2007-01-11 21:51:36.000000000 -0200
+++ linux-2.6.19-paravirt1/include/asm-x86_64/alternative.h	2007-01-08 06:53:56.000000000 -0200
@@ -134,8 +134,10 @@ static inline void alternatives_smp_swit
 #define LOCK_PREFIX ""
 #endif
 
-struct paravirt_patch;
+
+
 #ifdef CONFIG_PARAVIRT
+struct paravirt_patch;
 void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end);
 #else
 static inline void
@@ -145,4 +147,5 @@ apply_paravirt(struct paravirt_patch *st
 #define __stop_parainstructions NULL
 #endif
 
+
 #endif /* _X86_64_ALTERNATIVE_H */
diff -urp linux-2.6.19-paravirt0/include/asm-x86_64/irqflags.h linux-2.6.19-paravirt1/include/asm-x86_64/irqflags.h
--- linux-2.6.19-paravirt0/include/asm-x86_64/irqflags.h	2007-01-11 21:56:03.000000000 -0200
+++ linux-2.6.19-paravirt1/include/asm-x86_64/irqflags.h	2007-01-09 17:55:54.000000000 -0200
@@ -18,7 +18,6 @@ static inline int raw_irqs_disabled_flag
 {
 	return !(flags & (1 << 9));
 }
-
 #else
 
 /*
diff -urp linux-2.6.19-paravirt0/include/asm-x86_64/msr.h linux-2.6.19-paravirt1/include/asm-x86_64/msr.h
--- linux-2.6.19-paravirt0/include/asm-x86_64/msr.h	2007-01-11 21:56:03.000000000 -0200
+++ linux-2.6.19-paravirt1/include/asm-x86_64/msr.h	2007-01-09 18:12:03.000000000 -0200
@@ -105,15 +105,6 @@ static inline void native_cpuid(unsigned
 
 #endif /* CONFIG_PARAVIRT */
 
-#define rdtscp(low,high,aux) \
-     asm volatile (".byte 0x0f,0x01,0xf9" : "=a" (low), "=d" (high), "=c" (aux))
-
-#define rdtscpll(val, aux) do { \
-     unsigned long __a, __d; \
-     asm volatile (".byte 0x0f,0x01,0xf9" : "=a" (__a), "=d" (__d), "=c" (aux)); \
-     (val) = (__d << 32) | __a; \
-} while (0)
-
 #define checking_wrmsrl(msr,val) wrmsr_safe(msr,(u32)(val),(u32)((val)>>32))
 #define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
 
@@ -125,6 +116,7 @@ static inline void cpuid(unsigned int op
 	*eax = op;
 	__cpuid(eax, ebx, ecx, edx);
 }
+
 /* Some CPUID calls want 'count' to be placed in ecx */
 static inline void cpuid_count(int op, int count,
 			 int *eax, int *ebx, int *ecx, int *edx)
@@ -140,24 +132,28 @@ static inline void cpuid_count(int op, i
 static inline unsigned int cpuid_eax(unsigned int op)
 {
 	unsigned int eax, ebx, ecx, edx;
+	eax = op;
 	__cpuid(&eax, &ebx, &ecx, &edx);
 	return eax;
 }
 static inline unsigned int cpuid_ebx(unsigned int op)
 {
 	unsigned int eax, ebx, ecx, edx;
+	eax = op;
 	__cpuid(&eax, &ebx, &ecx, &edx);
 	return ebx;
 }
 static inline unsigned int cpuid_ecx(unsigned int op)
 {
 	unsigned int eax, ebx, ecx, edx;
+	eax = op;
 	__cpuid(&eax, &ebx, &ecx, &edx);
 	return ecx;
 }
 static inline unsigned int cpuid_edx(unsigned int op)
 {
 	unsigned int eax, ebx, ecx, edx;
+	eax = op;
 	__cpuid(&eax, &ebx, &ecx, &edx);
 	return edx;
 }
diff -urp linux-2.6.19-paravirt0/include/asm-x86_64/paravirt.h linux-2.6.19-paravirt1/include/asm-x86_64/paravirt.h
--- linux-2.6.19-paravirt0/include/asm-x86_64/paravirt.h	2007-01-11 21:56:03.000000000 -0200
+++ linux-2.6.19-paravirt1/include/asm-x86_64/paravirt.h	2007-01-11 22:50:41.000000000 -0200
@@ -17,7 +17,8 @@
 #define PARAVIRT_SAVE_FLAGS 3
 #define PARAVIRT_SAVE_FLAGS_IRQ_DISABLE 4
 #define PARAVIRT_INTERRUPT_RETURN 5
-#define PARAVIRT_STI_SYSRETQ 6
+#define PARAVIRT_SYSRETQ 6
+#define PARAVIRT_SWAPGS	7
 
 /* Bitmask of what can be clobbered: usually at least rax. */
 #define CLBR_NONE 0x0
@@ -34,6 +35,11 @@ struct desc_struct;
 struct tss_struct;
 struct mm_struct;
 
+struct swapgs {
+	u64 ret;
+	void (*fn)(void);
+};
+
 struct paravirt_ops
 {
 	int paravirt_enabled;
@@ -43,6 +49,9 @@ struct paravirt_ops
 
 	const char *name;
 
+	unsigned long *vsyscall_page;
+
+	struct swapgs swapgs;
 	/*
 	 * Patch may replace one of the defined code sequences with arbitrary
 	 * code, subject to the same register constraints.  This generally
@@ -89,6 +98,7 @@ struct paravirt_ops
 	void (*restore_fl)(unsigned long);
 	void (*irq_disable)(void);
 	void (*irq_enable)(void);
+
 	void (*safe_halt)(void);
 	void (*halt)(void);
 	void (*wbinvd)(void);
@@ -98,6 +108,7 @@ struct paravirt_ops
 	int (*write_msr)(unsigned int msr, u64 val);
 
 	u64 (*read_tsc)(void);
+	u64 (*read_tscp)(int *aux);
 	u64 (*read_pmc)(void);
 
 	void (*load_tr_desc)(void);
@@ -167,7 +178,7 @@ struct paravirt_ops
 	void (*set_lazy_mode)(int mode);
 
 	/* These two are jmp to, not actually called. */
-	void (*irq_enable_sysexit)(void);
+	void (*sysret)(void);
 	void (*iret)(void);
 
 	void (*startup_ipi_hook)(int phys_apicid, unsigned long start_eip, unsigned long start_esp);
@@ -262,6 +273,14 @@ static inline void halt(void)
 	val2 = _l >> 32;					\
 } while(0)
 
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr,a,b) ({					\
+	int _err;						\
+	u64 _l = paravirt_ops.read_msr(msr,&_err);		\
+	(*a) = (u32)_l;						\
+	(*b) = _l >> 32;					\
+	_err; })
+
 #define wrmsr(msr,val1,val2) do {				\
 	u64 _l = ((u64)(val2) << 32) | (val1);			\
 	paravirt_ops.write_msr((msr), _l);			\
@@ -273,19 +292,12 @@ static inline void halt(void)
 } while(0)
 
 #define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val)))
+
 #define wrmsr_safe(msr,a,b) ({					\
 	u64 _l = ((u64)(b) << 32) | (a);			\
 	paravirt_ops.write_msr((msr),_l);			\
 })
 
-/* rdmsr with exception handling */
-#define rdmsr_safe(msr,a,b) ({					\
-	int _err;						\
-	u64 _l = paravirt_ops.read_msr(msr,&_err);		\
-	(*a) = (u32)_l;						\
-	(*b) = _l >> 32;					\
-	_err; })
-
 #define rdtsc(low,high) do {					\
 	u64 _l = paravirt_ops.read_tsc();			\
 	low = (u32)_l;						\
@@ -299,6 +311,14 @@ static inline void halt(void)
 
 #define rdtscll(val) (val = paravirt_ops.read_tsc())
 
+#define rdtscp(low,high,aux) do {				\
+	u64 _val = paravirt_ops.read_tscp(&aux);		\
+	low = (int)_val;					\
+	high = _val >> 32;					\
+} while (0)
+
+#define rdtscpll(val, aux) (val) = paravirt_ops.read_tscp(&aux)
+
 #define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
 
 #define rdpmc(counter,low,high) do {				\
@@ -375,7 +395,6 @@ void native_pte_clear(struct mm_struct *
 void native_pmd_clear(pmd_t *pmd);
 void native_nop(void);
 
-
 static inline void paravirt_activate_mm(struct mm_struct *prev,
 					struct mm_struct *next)
 {
@@ -483,6 +502,9 @@ struct paravirt_patch {
 	"  .short " __stringify(clobber) "\n"		\
 	".popsection"
 
+/* These functions tends to be very simple. So, if they touch any register,
+ * the calle-saved ones may already fulfill their needs, and hopefully we
+ * have no need to save any. */
 static inline unsigned long __raw_local_save_flags(void)
 {
 	unsigned long f;
@@ -533,18 +555,12 @@ static inline unsigned long __raw_local_
 	return f;
 }
 
+#define CLI_STRING paravirt_alt("call *paravirt_ops+%c[irq_disable];",	\
+		     PARAVIRT_IRQ_DISABLE, CLBR_NONE)
 
+#define STI_STRING paravirt_alt("call *paravirt_ops+%c[irq_enable];",	\
+		     PARAVIRT_IRQ_ENABLE, CLBR_NONE)
 
-/* Still x86-ish */
-#define CLI_STRING paravirt_alt("pushq %%rcx; pushq %%rdx;"		\
-		     "call *paravirt_ops+%c[irq_disable];"		\
-		     "popq %%rdx; popq %%rcx",				\
-		     PARAVIRT_IRQ_DISABLE, CLBR_RAX)
-
-#define STI_STRING paravirt_alt("pushq %%rcx; pushq %%rdx;"		\
-		     "call *paravirt_ops+%c[irq_enable];"		\
-		     "popq %%rdx; popq %%rcx",				\
-		     PARAVIRT_IRQ_ENABLE, CLBR_RAX)
 #define CLI_STI_CLOBBERS , "%rax"
 #define CLI_STI_INPUT_ARGS \
 	,								\
@@ -571,22 +587,23 @@ static inline unsigned long __raw_local_
 
 #define DISABLE_INTERRUPTS(clobbers)			\
 	PARA_PATCH(PARAVIRT_IRQ_DISABLE, clobbers,	\
-	pushq %rcx; pushq %rdx;				\
-	call *paravirt_ops+PARAVIRT_irq_disable;	\
-	popq %rdx; popq %rcx)				\
+	call *paravirt_ops+PARAVIRT_irq_disable)
 
 #define ENABLE_INTERRUPTS(clobbers)			\
 	PARA_PATCH(PARAVIRT_IRQ_ENABLE, clobbers,	\
-	pushq %rcx; pushq %rdx;				\
-	call *%cs:paravirt_ops+PARAVIRT_irq_enable;	\
-	popq %rdx; popq %rcx)
-
-#define ENABLE_INTERRUPTS_SYSRETQ			\
-	PARA_PATCH(PARAVIRT_STI_SYSRETQ, CLBR_ANY,	\
-	jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit)
+	call *%cs:paravirt_ops+PARAVIRT_irq_enable)
 
-#define GET_CR0_INTO_RAX			\
-	call *paravirt_ops+PARAVIRT_read_cr0
+#define SYSRETQ						\
+	PARA_PATCH(PARAVIRT_SYSRETQ, CLBR_ANY,		\
+	jmp *%cs:paravirt_ops+PARAVIRT_sysret)
+
+#define SWAPGS						\
+	movq $. + 0x11, (paravirt_ops+PARAVIRT_swapgs);	\
+	jmp  (paravirt_ops+PARAVIRT_swapgs+8);		\
+
+/* this is needed in early_idt_handler */
+#define GET_CR2_INTO_RAX 				\
+	call *paravirt_ops+PARAVIRT_read_cr2
 
 #endif /* __ASSEMBLY__ */
 #else  /* !CONFIG_PARAVIRT */
diff -urp linux-2.6.19-paravirt0/include/asm-x86_64/timex.h linux-2.6.19-paravirt1/include/asm-x86_64/timex.h
--- linux-2.6.19-paravirt0/include/asm-x86_64/timex.h	2006-12-11 17:32:53.000000000 -0200
+++ linux-2.6.19-paravirt1/include/asm-x86_64/timex.h	2007-01-10 15:10:00.000000000 -0200
@@ -31,14 +31,29 @@ static __always_inline cycles_t get_cycl
 {
 	unsigned long long ret;
 	unsigned eax;
+	unsigned int (*fn)(unsigned int) = &cpuid_eax;
 	/* Don't do an additional sync on CPUs where we know
 	   RDTSC is already synchronous. */
-	alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
-			  "=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
+	alternative_io("call *%3", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
+			"=a" (eax) , "D" (1) , "m" (fn));
 	rdtscll(ret);
 	return ret;
 }
 
+/* Inside a vsyscall, we cannot call paravirt functions. (like rdtsc
+ * and cpuid). For the host, use this function instead */
+static __always_inline cycles_t vget_cycles_sync(void)
+{
+	unsigned long ret;
+	unsigned eax;
+	/* Don't do an additional sync on CPUs where we know
+	   RDTSC is already synchronous. */
+	alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
+			  "=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
+
+	asm volatile("rdtsc" : "=A" (ret));
+	return ret;
+}
 extern unsigned int cpu_khz;
 
 extern int read_current_timer(unsigned long *timer_value);


[Index of Archives]     [KVM Development]     [Libvirt Development]     [Libvirt Users]     [CentOS Virtualization]     [Netdev]     [Ethernet Bridging]     [Linux Wireless]     [Kernel Newbies]     [Security]     [Linux for Hams]     [Netfilter]     [Bugtraq]     [Yosemite Forum]     [MIPS Linux]     [ARM Linux]     [Linux RAID]     [Linux Admin]     [Samba]

  Powered by Linux