Hi all, I've been looking at finding common ground between the VMI, Xen and other paravirtualization approaches, and after some discussion, we're getting somewhere. These first two patches are the fundamentals, stolen mainly from the VMI patches: removing assumptions about the kernel running in ring 0, and macro-izing all the obvious para-virtualize-needing insns. The third patch is more ambitious: it introduces a "paravirt_ops" structure (a-la PPC's ppc_md) through which all these ops are indirected. This should allow Xen, VMI and other variants to build on a common base. These patches also live at http://kernel.org/pub/linux/kernel/people/rusty/Paravirt Feedback welcome! Rusty. Name: Kernel Ring Cleanups Status: Booted on 2.6.17-rc2-git7 Signed-off-by: Rusty Russell <rusty@xxxxxxxxxxxxxxx> This is Zach's patch to clean up assumptions about the kernel running in ring 0 (which it doesn't when running paravirtualized). 1) Remove the hardcoded 3 and introduce #define SEGMENT_RPL_MASK 3 2) Add a get_kernel_rpl() function 3) Create COMPARE_SEGMENT_STACK and COMPARE_SEGMENT_REG macros which can mask out the bottom two bits (RPL) when comparing for paravirtualization. 
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.6.17-rc2-git7/arch/i386/kernel/entry.S tmp/arch/i386/kernel/entry.S --- linux-2.6.17-rc2-git7/arch/i386/kernel/entry.S 2006-04-21 12:05:02.000000000 +1000 +++ tmp/arch/i386/kernel/entry.S 2006-05-02 16:00:17.000000000 +1000 @@ -144,9 +144,11 @@ ret_from_exception: ret_from_intr: GET_THREAD_INFO(%ebp) movl EFLAGS(%esp), %eax # mix EFLAGS and CS + andl $VM_MASK, %eax movb CS(%esp), %al - testl $(VM_MASK | 3), %eax - jz resume_kernel + andb $SEGMENT_RPL_MASK, %al + cmpl $SEGMENT_RPL_MASK, %eax + jb resume_kernel # returning to kernel or vm86-space ENTRY(resume_userspace) cli # make sure we don't miss an interrupt # setting need_resched or sigpending @@ -386,17 +388,14 @@ syscall_badsys: /* put ESP to the proper location */ \ movl %eax, %esp; #define UNWIND_ESPFIX_STACK \ - pushl %eax; \ - movl %ss, %eax; \ - /* see if on 16bit stack */ \ - cmpw $__ESPFIX_SS, %ax; \ + COMPARE_SEGMENT_REG(__ESPFIX_SS, %ss); \ jne 28f; \ - movl $__KERNEL_DS, %edx; \ + movl $__USER_DS, %edx; \ movl %edx, %ds; \ movl %edx, %es; \ /* switch to 32bit stack */ \ FIXUP_ESPFIX_STACK \ -28: popl %eax; +28:; /* * Build the entry stubs and pointer table with @@ -455,6 +454,7 @@ error_code: pushl %es UNWIND_ESPFIX_STACK popl %ecx + movl EAX(%esp), %eax movl ES(%esp), %edi # get the function address movl ORIG_EAX(%esp), %edx # get the error code movl %eax, ORIG_EAX(%esp) @@ -505,12 +505,12 @@ device_not_available_emulate: * the instruction that would have done it for sysenter. */ #define FIX_STACK(offset, ok, label) \ - cmpw $__KERNEL_CS,4(%esp); \ + COMPARE_SEGMENT_STACK(__KERNEL_CS, 4); \ jne ok; \ label: \ movl TSS_sysenter_esp0+offset(%esp),%esp; \ pushfl; \ - pushl $__KERNEL_CS; \ + push %cs; \ pushl $sysenter_past_esp KPROBE_ENTRY(debug) @@ -534,10 +534,7 @@ debug_stack_correct: * fault happened on the sysenter path. 
*/ ENTRY(nmi) - pushl %eax - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl %eax + COMPARE_SEGMENT_REG(__ESPFIX_SS, %ss) je nmi_16bit_stack cmpl $sysenter_entry,(%esp) je nmi_stack_fixup @@ -564,7 +561,7 @@ nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) jmp nmi_stack_correct nmi_debug_stack_check: - cmpw $__KERNEL_CS,16(%esp) + COMPARE_SEGMENT_STACK(__KERNEL_CS, 16) jne nmi_stack_correct cmpl $debug,(%esp) jb nmi_stack_correct diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.6.17-rc2-git7/arch/i386/kernel/process.c tmp/arch/i386/kernel/process.c --- linux-2.6.17-rc2-git7/arch/i386/kernel/process.c 2006-04-21 12:05:02.000000000 +1000 +++ tmp/arch/i386/kernel/process.c 2006-05-02 15:57:41.000000000 +1000 @@ -347,7 +347,7 @@ int kernel_thread(int (*fn)(void *), voi regs.xes = __USER_DS; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS; + regs.xcs = __KERNEL_CS | get_kernel_rpl(); regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. 
*/ diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.6.17-rc2-git7/arch/i386/kernel/traps.c tmp/arch/i386/kernel/traps.c --- linux-2.6.17-rc2-git7/arch/i386/kernel/traps.c 2006-04-21 12:05:02.000000000 +1000 +++ tmp/arch/i386/kernel/traps.c 2006-05-02 15:57:41.000000000 +1000 @@ -1013,10 +1013,10 @@ fastcall void setup_x86_bogus_stack(unsi memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20); /* fill in the switch pointers */ switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off; - switch16_ptr[1] = __ESPFIX_SS; + switch16_ptr[1] = __ESPFIX_SS | get_kernel_rpl(); switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) + 8 - CPU_16BIT_STACK_SIZE; - switch32_ptr[1] = __KERNEL_DS; + switch32_ptr[1] = __KERNEL_DS | get_kernel_rpl(); } fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp) diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.6.17-rc2-git7/include/asm-i386/ptrace.h tmp/include/asm-i386/ptrace.h --- linux-2.6.17-rc2-git7/include/asm-i386/ptrace.h 2006-03-23 12:44:59.000000000 +1100 +++ tmp/include/asm-i386/ptrace.h 2006-05-02 15:57:41.000000000 +1000 @@ -60,6 +60,7 @@ struct pt_regs { #ifdef __KERNEL__ #include <asm/vm86.h> +#include <asm/segment.h> struct task_struct; extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code); @@ -73,11 +74,11 @@ extern void send_sigtrap(struct task_str */ static inline int user_mode(struct pt_regs *regs) { - return (regs->xcs & 3) != 0; + return (regs->xcs & SEGMENT_RPL_MASK) == 3; } static inline int user_mode_vm(struct pt_regs *regs) { - return ((regs->xcs & 3) | (regs->eflags & VM_MASK)) != 0; + return (((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= 3); } #define instruction_pointer(regs) ((regs)->eip) #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) diff -urpN --exclude TAGS -X 
/home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.6.17-rc2-git7/include/asm-i386/segment.h tmp/include/asm-i386/segment.h --- linux-2.6.17-rc2-git7/include/asm-i386/segment.h 2006-03-23 12:44:59.000000000 +1100 +++ tmp/include/asm-i386/segment.h 2006-05-02 15:57:41.000000000 +1000 @@ -112,4 +112,18 @@ */ #define IDT_ENTRIES 256 +/* Bottom two bits of xcs give the ring privilege level */ +#define SEGMENT_RPL_MASK 0x3 + +#define get_kernel_rpl() 0 + +#define COMPARE_SEGMENT_STACK(segment, offset) \ + cmpw $segment, offset(%esp); + +#define COMPARE_SEGMENT_REG(segment, reg) \ + pushl %eax; \ + mov reg, %eax; \ + cmpw $segment,%ax; \ + popl %eax; + #endif -- ccontrol: http://ccontrol.ozlabs.org