Hello all, Here's a new version of the paravirt_ops x86_64 patch. With this message, I'm sending an incremental patch. The complete patches can be found, from now on, at http://et.redhat.com/~gcosta/paravirt_ops/ The main aim of this new update is to fix a critical bug, namely, Rusty's name. However, I took the opportunity to write some new less important pieces of code, highlighting: * proper casts in places in which macros were replaced by functions, and the arguments happened to mismatch types. * calling paravirt_ops functions from .S files (I lacked this last time) * addition of the startup_paravirt function, to kick off guests (not tested) * fixed problems with patching * added a new field, vsyscall_page in the paravirt_ops struct, which allows the kernel to map a vsyscall_page on its own * fixed vsyscall functions to avoid calling paravirt_ops functions. __vsyscall_0 is the page to be mapped for the host. (set and get cpu not yet tested.) * fixed cpuid calls. * added substitute for the swapgs instruction. (Notice that I'm not saying it works ;-) ) In my TODO list, you can find: * putting swapgs to work * making sure legacy mode binaries work * merging in valuable comments from all of you ;-) -- Glauber de Oliveira Costa Red Hat Inc. 
"Free as in Freedom" -------------- next part -------------- diff -urp linux-2.6.19-paravirt0/arch/i386/kernel/alternative.c linux-2.6.19-paravirt1/arch/i386/kernel/alternative.c --- linux-2.6.19-paravirt0/arch/i386/kernel/alternative.c 2007-01-11 21:57:07.000000000 -0200 +++ linux-2.6.19-paravirt1/arch/i386/kernel/alternative.c 2007-01-11 21:42:22.000000000 -0200 @@ -431,9 +431,7 @@ void __init alternative_instructions(voi } #endif #ifdef CONFIG_PARAVIRT - #ifndef CONFIG_X86_64 /* Not working properly yet */ apply_paravirt(__start_parainstructions, __stop_parainstructions); - #endif #endif local_irq_restore(flags); } diff -urp linux-2.6.19-paravirt0/arch/x86_64/ia32/syscall32.c linux-2.6.19-paravirt1/arch/x86_64/ia32/syscall32.c --- linux-2.6.19-paravirt0/arch/x86_64/ia32/syscall32.c 2007-01-11 21:51:35.000000000 -0200 +++ linux-2.6.19-paravirt1/arch/x86_64/ia32/syscall32.c 2007-01-09 11:01:19.000000000 -0200 @@ -104,5 +104,5 @@ void syscall32_cpu_init(void) checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); - wrmsrl(MSR_CSTAR, ia32_cstar_target); + wrmsrl(MSR_CSTAR, (u64)ia32_cstar_target); } diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/asm-offsets.c linux-2.6.19-paravirt1/arch/x86_64/kernel/asm-offsets.c --- linux-2.6.19-paravirt0/arch/x86_64/kernel/asm-offsets.c 2007-01-11 21:56:03.000000000 -0200 +++ linux-2.6.19-paravirt1/arch/x86_64/kernel/asm-offsets.c 2007-01-11 09:46:44.000000000 -0200 @@ -79,9 +79,10 @@ int main(void) ENTRY(paravirt_enabled); ENTRY(irq_disable); ENTRY(irq_enable); - ENTRY(irq_enable_sysexit); + ENTRY(sysret); ENTRY(iret); - ENTRY(read_cr0); + ENTRY(read_cr2); + ENTRY(swapgs); #endif return 0; diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/entry.S linux-2.6.19-paravirt1/arch/x86_64/kernel/entry.S --- linux-2.6.19-paravirt0/arch/x86_64/kernel/entry.S 2007-01-11 21:56:03.000000000 -0200 +++ linux-2.6.19-paravirt1/arch/x86_64/kernel/entry.S 2007-01-11 
22:22:26.000000000 -0200 @@ -51,6 +51,13 @@ #include <asm/page.h> #include <asm/irqflags.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#define ENABLE_INTERRUPTS(x) sti +#define DISABLE_INTERRUPTS(x) cli +#define SYSRETQ sysretq +#endif .code64 #ifndef CONFIG_PREEMPT @@ -179,6 +186,7 @@ rff_trace: CFI_ENDPROC END(ret_from_fork) + /* * System call entry. Upto 6 arguments in registers are supported. * @@ -223,7 +231,7 @@ ENTRY(system_call) * No need to follow this irqs off/on section - it's straight * and short: */ - sti + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_ARGS 8,1 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) @@ -245,7 +253,7 @@ ret_from_sys_call: /* edi: flagmask */ sysret_check: GET_THREAD_INFO(%rcx) - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl threadinfo_flags(%rcx),%edx andl %edi,%edx @@ -261,7 +269,7 @@ sysret_check: /*CFI_REGISTER rflags,r11*/ movq %gs:pda_oldrsp,%rsp swapgs - sysretq + SYSRETQ CFI_RESTORE_STATE /* Handle reschedules */ @@ -270,7 +278,7 @@ sysret_careful: bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule @@ -281,7 +289,7 @@ sysret_careful: /* Handle a signal */ sysret_signal: TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz 1f @@ -294,7 +302,7 @@ sysret_signal: 1: movl $_TIF_NEED_RESCHED,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. 
*/ - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -326,7 +334,7 @@ tracesys: */ .globl int_ret_from_sys_call int_ret_from_sys_call: - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $3,CS-ARGOFFSET(%rsp) je retint_restore_args @@ -347,20 +355,20 @@ int_careful: bt $TIF_NEED_RESCHED,%edx jnc int_very_careful TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi CFI_ADJUST_CFA_OFFSET -8 - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check /* handle signals and tracing -- both require a full stack frame */ int_very_careful: TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST /* Check for syscall exit trace */ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx @@ -383,7 +391,7 @@ int_signal: 1: movl $_TIF_NEED_RESCHED,%edi int_restore_rest: RESTORE_REST - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check CFI_ENDPROC @@ -525,7 +533,7 @@ ENTRY(common_interrupt) interrupt do_IRQ /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF decl %gs:pda_irqcount leaveq @@ -552,13 +560,13 @@ retint_swapgs: /* * The iretq could re-enable interrupts: */ - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_IRETQ swapgs jmp restore_args retint_restore_args: - cli + DISABLE_INTERRUPTS(CLBR_NONE) /* * The iretq could re-enable interrupts: */ @@ -566,35 +574,22 @@ retint_restore_args: restore_args: RESTORE_ARGS 0,8,0 iret_label: - iretq + INTERRUPT_RETURN - .section __ex_table,"a" - .quad iret_label,bad_iret - .previous - .section .fixup,"ax" - /* force a signal here? 
this matches i386 behaviour */ - /* running with kernel gs */ -bad_iret: - movq $11,%rdi /* SIGSEGV */ - TRACE_IRQS_ON - sti - jmp do_exit - .previous - /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc retint_signal TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp retint_check @@ -602,14 +597,14 @@ retint_signal: testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz retint_swapgs TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume RESTORE_REST - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) @@ -738,7 +733,7 @@ END(spurious_interrupt) .if \ist addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) .endif - cli + DISABLE_INTERRUPTS(CLBR_NONE) .if \irqtrace TRACE_IRQS_OFF .endif @@ -770,7 +765,7 @@ paranoid_swapgs\trace: swapgs paranoid_restore\trace: RESTORE_ALL 8 - iretq + INTERRUPT_RETURN paranoid_userspace\trace: GET_THREAD_INFO(%rcx) movl threadinfo_flags(%rcx),%ebx @@ -785,11 +780,11 @@ paranoid_userspace\trace: .if \trace TRACE_IRQS_ON .endif - sti + ENABLE_INTERRUPTS(CLBR_NONE) xorl %esi,%esi /* arg2: oldset */ movq %rsp,%rdi /* arg1: &pt_regs */ call do_notify_resume - cli + DISABLE_INTERRUPTS(CLBR_NONE) .if \trace TRACE_IRQS_OFF .endif @@ -798,9 +793,9 @@ paranoid_schedule\trace: .if \trace TRACE_IRQS_ON .endif - sti + ENABLE_INTERRUPTS(CLBR_NONE) call schedule - cli + DISABLE_INTERRUPTS(CLBR_NONE) .if \trace TRACE_IRQS_OFF .endif @@ -862,7 +857,7 @@ error_sti: error_exit: movl %ebx,%eax RESTORE_REST - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax @@ -904,7 +899,7 @@ ENTRY(load_gs_index) CFI_STARTPROC pushf 
CFI_ADJUST_CFA_OFFSET 8 - cli + DISABLE_INTERRUPTS(CLBR_NONE) swapgs gs_change: movl %edi,%gs @@ -1065,18 +1060,32 @@ KPROBE_ENTRY(int3) KPROBE_END(int3) #ifdef CONFIG_PARAVIRT +/* Not yet working. Do not use */ +ENTRY(native_swapgs) + swapgs + jmp %cs:(paravirt_ops+PARAVIRT_swapgs) +ENDPROC(native_swapgs) + ENTRY(native_iret) 1: iretq .section __ex_table,"a" .align 8 .quad 1b, bad_iret .previous +.section .fixup,"ax" +/* force a signal here? this matches i386 behaviour */ +/* running with kernel gs */ +bad_iret: + movq $11,%rdi /* SIGSEGV */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + jmp do_exit + .previous ENDPROC(native_iret) -ENTRY(native_irq_enable_sysexit) - sti +ENTRY(native_sysret) sysretq -ENDPROC(native_irq_enable_sysexit) +ENDPROC(native_sysret) #endif /* CONFIG_PARAVIRT */ diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/head64.c linux-2.6.19-paravirt1/arch/x86_64/kernel/head64.c --- linux-2.6.19-paravirt0/arch/x86_64/kernel/head64.c 2007-01-11 21:56:03.000000000 -0200 +++ linux-2.6.19-paravirt1/arch/x86_64/kernel/head64.c 2007-01-09 18:13:19.000000000 -0200 @@ -62,7 +62,7 @@ void __init x86_64_start_kernel(char * r for (i = 0; i < IDT_ENTRIES; i++) set_intr_gate(i, early_idt_handler); - asm volatile("lidt %0" :: "m" (idt_descr)); + load_idt((const struct desc_struct *)&idt_descr); early_printk("Kernel alive\n"); diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/head.S linux-2.6.19-paravirt1/arch/x86_64/kernel/head.S --- linux-2.6.19-paravirt0/arch/x86_64/kernel/head.S 2006-12-11 17:32:53.000000000 -0200 +++ linux-2.6.19-paravirt1/arch/x86_64/kernel/head.S 2007-01-11 22:42:33.000000000 -0200 @@ -16,6 +16,13 @@ #include <asm/page.h> #include <asm/msr.h> #include <asm/cache.h> + +#ifdef CONFIG_PARAVIRT +#include <asm/asm-offsets.h> +#include <asm/paravirt.h> +#else +#define GET_CR2_INTO_RAX mov %cr2, %rax +#endif /* we are not able to switch in one step to the final KERNEL ADRESS SPACE * because we need identity-mapped pages on setup so define 
__START_KERNEL to @@ -106,6 +113,14 @@ startup_64: * reload the page tables here. */ +#ifdef CONFIG_PARAVIRT + /* a CS ended in 0x3 indicates we're in userspace. That's where + * our paravirt guests run. */ + movq %cs, %rax + testq $0x3, %rax + jnz startup_paravirt +#endif + /* Enable PAE mode and PGE */ xorq %rax, %rax btsq $5, %rax @@ -208,10 +223,11 @@ ENTRY(early_idt_handler) cmpl $2,early_recursion_flag(%rip) jz 1f incl early_recursion_flag(%rip) - xorl %eax,%eax movq 8(%rsp),%rsi # get rip movq (%rsp),%rdx - movq %cr2,%rcx + GET_CR2_INTO_RAX + movq %rax,%rcx + xorq %rax, %rax leaq early_idt_msg(%rip),%rdi call early_printk cmpl $2,early_recursion_flag(%rip) @@ -232,6 +248,47 @@ early_idt_msg: early_idt_ripmsg: .asciz "RIP %s\n" +#ifdef CONFIG_PARAVIRT +ENTRY(startup_paravirt) + cld + + /* initial stack location */ + movq $(init_thread_union+THREAD_SIZE),%rsp + + /* We take pains to preserve all the regs. */ + pushq %r11 + pushq %r10 + pushq %r9 + pushq %r8 + pushq %rsi + pushq %rdi + pushq %rdx + pushq %rcx + pushq %rax + + /* paravirt.o is last in link, and that probe fn never returns */ + pushq $__start_paravirtprobe +1: + movq 0(%rsp), %rax + pushq (%rax) + movq 8(%rsp), %rdi + call *(%rsp) + popq %rax + + movq 0x10(%rsp), %rax + movq 0x18(%rsp), %rcx + movq 0x20(%rsp), %rdx + movq 0x28(%rsp), %rdi + movq 0x30(%rsp), %rsi + movq 0x38(%rsp), %r8 + movq 0x40(%rsp), %r9 + movq 0x48(%rsp), %r10 + movq 0x50(%rsp), %r11 + + addl $8, (%rsp) + jmp 1b +#endif + .code32 ENTRY(no_long_mode) /* This isn't an x86-64 CPU so hang */ diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/paravirt.c linux-2.6.19-paravirt1/arch/x86_64/kernel/paravirt.c --- linux-2.6.19-paravirt0/arch/x86_64/kernel/paravirt.c 2007-01-11 21:56:03.000000000 -0200 +++ linux-2.6.19-paravirt1/arch/x86_64/kernel/paravirt.c 2007-01-11 20:10:06.000000000 -0200 @@ -1,6 +1,6 @@ /* Paravirtualization interfaces Copyright (C) 2007 Glauber de Oliveira Costa, Red Hat Inc. 
- Based on i386 work by Rusty Russel. + Based on i386 work by Rusty Russell. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -59,11 +59,14 @@ void memory_setup(void) asm("start_" #name ": " code "; end_" #name ":") DEF_NATIVE(cli, "cli"); DEF_NATIVE(sti, "sti"); -DEF_NATIVE(popfq, "pushq %rax; popfq"); +/* We push rdi , and pop in rda. This is due to x86_64 calling conventions + * Recall that we are patching a function call */ +DEF_NATIVE(popfq, "pushq %rdi; popfq"); DEF_NATIVE(pushfq, "pushfq; popq %rax"); DEF_NATIVE(pushfq_cli, "pushfq; popq %rax; cli"); -DEF_NATIVE(iret, "iret"); -DEF_NATIVE(sti_sysretq, "sti; sysretq"); +DEF_NATIVE(iret, "iretq"); +DEF_NATIVE(sysretq, "sysretq"); +DEF_NATIVE(swapgs, "swapgs"); static const struct native_insns { @@ -75,7 +78,8 @@ static const struct native_insns [PARAVIRT_SAVE_FLAGS] = { start_pushfq, end_pushfq }, [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushfq_cli, end_pushfq_cli }, [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret }, - [PARAVIRT_STI_SYSRETQ] = { start_sti_sysretq, end_sti_sysretq }, + [PARAVIRT_SYSRETQ] = { start_sysretq, end_sysretq }, + [PARAVIRT_SWAPGS] = { start_swapgs, end_swapgs }, }; static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) @@ -88,7 +92,6 @@ static unsigned native_patch(u8 type, u1 insn_len = native_insns[type].end - native_insns[type].start; - /* Similarly if we can't fit replacement. 
*/ if (len < insn_len) return len; @@ -243,7 +246,7 @@ static void native_wbinvd(void) asm volatile("wbinvd": : :"memory"); } -static unsigned long native_read_msr(unsigned int msr, int *err) +static u64 native_read_msr(unsigned int msr, int *err) { unsigned long val; @@ -287,6 +290,13 @@ static u64 native_read_tsc(void) return val; } +static u64 native_read_tscp(int *aux) +{ + u64 val; + asm volatile ("rdtscp" : "=A" (val), "=c" (aux)); + return val; +} + static u64 native_read_pmc(void) { unsigned long val; @@ -463,7 +473,8 @@ void native_pmd_clear(pmd_t *pmd) /* These are in entry.S */ extern void native_iret(void); -extern void native_irq_enable_sysexit(void); +extern void native_sysret(void); +extern void native_swapgs(void); static int __init print_banner(void) { @@ -475,12 +486,18 @@ core_initcall(print_banner); /* We simply declare start_kernel to be the paravirt probe of last resort. */ paravirt_probe(start_kernel); +extern unsigned long __vsyscall_0; struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, .kernel_rpl = 0, .pgd_alignment = sizeof(pgd_t) * PTRS_PER_PGD, + .swapgs = { + .ret = 0, + .fn = native_swapgs, + }, + .vsyscall_page = &__vsyscall_0, .patch = native_patch, .banner = default_banner, .arch_setup = native_nop, @@ -512,6 +529,7 @@ struct paravirt_ops paravirt_ops = { .read_msr = native_read_msr, .write_msr = native_write_msr, .read_tsc = native_read_tsc, + .read_tscp = native_read_tscp, .read_pmc = native_read_pmc, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, @@ -571,7 +589,7 @@ struct paravirt_ops paravirt_ops = { .make_pud = native_make_pud, .make_pgd = native_make_pgd, - .irq_enable_sysexit = native_irq_enable_sysexit, + .sysret = native_sysret, .iret = native_iret, .dup_mmap = (void *)native_nop, @@ -580,4 +598,5 @@ struct paravirt_ops paravirt_ops = { .startup_ipi_hook = (void *)native_nop, }; + EXPORT_SYMBOL(paravirt_ops); diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/setup64.c 
linux-2.6.19-paravirt1/arch/x86_64/kernel/setup64.c --- linux-2.6.19-paravirt0/arch/x86_64/kernel/setup64.c 2006-12-11 17:32:53.000000000 -0200 +++ linux-2.6.19-paravirt1/arch/x86_64/kernel/setup64.c 2007-01-09 10:24:25.000000000 -0200 @@ -123,7 +123,7 @@ void pda_init(int cpu) asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); /* Memory clobbers used to order PDA accessed */ mb(); - wrmsrl(MSR_GS_BASE, pda); + wrmsrl(MSR_GS_BASE, (u64)pda); mb(); pda->cpunumber = cpu; @@ -160,7 +160,7 @@ void syscall_init(void) * but only a 32bit target. LSTAR sets the 64bit rip. */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_LSTAR, (u64)system_call); #ifdef CONFIG_IA32_EMULATION syscall32_cpu_init (); @@ -223,8 +223,8 @@ void __cpuinit cpu_init (void) memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); cpu_gdt_descr[cpu].size = GDT_SIZE; - asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); - asm volatile("lidt %0" :: "m" (idt_descr)); + load_gdt((const struct desc_struct *)&cpu_gdt_descr[cpu]); + load_idt((const struct desc_struct *)&idt_descr); memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); syscall_init(); diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/setup.c linux-2.6.19-paravirt1/arch/x86_64/kernel/setup.c --- linux-2.6.19-paravirt0/arch/x86_64/kernel/setup.c 2007-01-11 21:56:03.000000000 -0200 +++ linux-2.6.19-paravirt1/arch/x86_64/kernel/setup.c 2007-01-09 10:22:24.000000000 -0200 @@ -341,6 +341,12 @@ static void discover_ebda(void) ebda_size = 64*1024; } +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +void __attribute__((weak)) memory_setup(void) +{ + return setup_memory_region(); +} + void __init setup_arch(char **cmdline_p) { printk(KERN_INFO "Command line: %s\n", saved_command_line); @@ -561,12 +567,6 @@ static int __cpuinit get_model_name(stru return 1; } -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -void __attribute__((weak)) memory_setup(void) -{ - return 
setup_memory_region(); -} - static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, eax, ebx, ecx, edx; diff -urp linux-2.6.19-paravirt0/arch/x86_64/kernel/vsyscall.c linux-2.6.19-paravirt1/arch/x86_64/kernel/vsyscall.c --- linux-2.6.19-paravirt0/arch/x86_64/kernel/vsyscall.c 2007-01-11 21:51:35.000000000 -0200 +++ linux-2.6.19-paravirt1/arch/x86_64/kernel/vsyscall.c 2007-01-10 06:57:22.000000000 -0200 @@ -73,7 +73,7 @@ static __always_inline void do_vgettimeo usec = __xtime.tv_nsec / 1000; if (__vxtime.mode != VXTIME_HPET) { - t = get_cycles_sync(); + t = vget_cycles_sync(); if (t < __vxtime.last_tsc) t = __vxtime.last_tsc; usec += ((t - __vxtime.last_tsc) * @@ -147,8 +147,8 @@ time_t __vsyscall(1) vtime(time_t *t) long __vsyscall(2) vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) { - unsigned int dummy, p; - unsigned long j = 0; + unsigned int p; + unsigned long dummy, j = 0; /* Fast cache - only recompute value once per jiffies and avoid relatively costly rdtscp/cpuid otherwise. @@ -162,7 +162,8 @@ vgetcpu(unsigned *cpu, unsigned *node, s p = tcache->blob[1]; } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { /* Load per CPU data from RDTSCP */ - rdtscp(dummy, dummy, p); + /* rdtscp() cannot be called due to the paravirt indirection */ + asm("rdtscp" : "=A" (dummy), "=c" (p)); } else { /* Load per CPU data from GDT */ asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); @@ -257,7 +258,11 @@ static void __cpuinit vsyscall_set_cpu(i node = cpu_to_node[cpu]; #endif if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) - write_rdtscp_aux((node << 12) | cpu); + /* This is write_rdtscp_aux. It cannot be called directly + * due to the paravirt indirection */ + asm("wrmsr" : /* no output */ + : "d"(0), + "a" ((node << 12) | cpu), "c" (0xc0000103)); /* Store cpu number in limit so that it can be loaded quickly in user space in vgetcpu. 
@@ -286,8 +291,12 @@ cpu_vsyscall_notifier(struct notifier_bl static void __init map_vsyscall(void) { +#ifndef CONFIG_PARAVIRT extern char __vsyscall_0; unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); +#else + unsigned long physaddr_page0 = __pa_symbol(paravirt_ops.vsyscall_page); +#endif /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); @@ -300,7 +309,14 @@ static int __init vsyscall_init(void) BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); - map_vsyscall(); +#ifdef CONFIG_PARAVIRT + if (paravirt_ops.vsyscall_page) +#endif + map_vsyscall(); +#ifdef CONFIG_PARAVIRT + else + __sysctl_vsyscall = 0; +#endif #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); #endif diff -urp linux-2.6.19-paravirt0/arch/x86_64/mm/pageattr.c linux-2.6.19-paravirt1/arch/x86_64/mm/pageattr.c --- linux-2.6.19-paravirt0/arch/x86_64/mm/pageattr.c 2007-01-11 21:51:35.000000000 -0200 +++ linux-2.6.19-paravirt1/arch/x86_64/mm/pageattr.c 2007-01-09 18:02:50.000000000 -0200 @@ -81,7 +81,7 @@ static void flush_kernel_map(void *arg) void *adr = page_address(pg); if (cpu_has_clflush) cache_flush_page(adr); - __flush_tlb_one(adr); + __flush_tlb_one((u64)adr); } } diff -urp linux-2.6.19-paravirt0/include/asm-x86_64/alternative.h linux-2.6.19-paravirt1/include/asm-x86_64/alternative.h --- linux-2.6.19-paravirt0/include/asm-x86_64/alternative.h 2007-01-11 21:51:36.000000000 -0200 +++ linux-2.6.19-paravirt1/include/asm-x86_64/alternative.h 2007-01-08 06:53:56.000000000 -0200 @@ -134,8 +134,10 @@ static inline void alternatives_smp_swit #define LOCK_PREFIX "" #endif -struct paravirt_patch; + + #ifdef CONFIG_PARAVIRT +struct paravirt_patch; void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end); #else static inline void @@ 
-145,4 +147,5 @@ apply_paravirt(struct paravirt_patch *st #define __stop_parainstructions NULL #endif + #endif /* _X86_64_ALTERNATIVE_H */ diff -urp linux-2.6.19-paravirt0/include/asm-x86_64/irqflags.h linux-2.6.19-paravirt1/include/asm-x86_64/irqflags.h --- linux-2.6.19-paravirt0/include/asm-x86_64/irqflags.h 2007-01-11 21:56:03.000000000 -0200 +++ linux-2.6.19-paravirt1/include/asm-x86_64/irqflags.h 2007-01-09 17:55:54.000000000 -0200 @@ -18,7 +18,6 @@ static inline int raw_irqs_disabled_flag { return !(flags & (1 << 9)); } - #else /* diff -urp linux-2.6.19-paravirt0/include/asm-x86_64/msr.h linux-2.6.19-paravirt1/include/asm-x86_64/msr.h --- linux-2.6.19-paravirt0/include/asm-x86_64/msr.h 2007-01-11 21:56:03.000000000 -0200 +++ linux-2.6.19-paravirt1/include/asm-x86_64/msr.h 2007-01-09 18:12:03.000000000 -0200 @@ -105,15 +105,6 @@ static inline void native_cpuid(unsigned #endif /* CONFIG_PARAVIRT */ -#define rdtscp(low,high,aux) \ - asm volatile (".byte 0x0f,0x01,0xf9" : "=a" (low), "=d" (high), "=c" (aux)) - -#define rdtscpll(val, aux) do { \ - unsigned long __a, __d; \ - asm volatile (".byte 0x0f,0x01,0xf9" : "=a" (__a), "=d" (__d), "=c" (aux)); \ - (val) = (__d << 32) | __a; \ -} while (0) - #define checking_wrmsrl(msr,val) wrmsr_safe(msr,(u32)(val),(u32)((val)>>32)) #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) @@ -125,6 +116,7 @@ static inline void cpuid(unsigned int op *eax = op; __cpuid(eax, ebx, ecx, edx); } + /* Some CPUID calls want 'count' to be placed in ecx */ static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, int *edx) @@ -140,24 +132,28 @@ static inline void cpuid_count(int op, i static inline unsigned int cpuid_eax(unsigned int op) { unsigned int eax, ebx, ecx, edx; + eax = op; __cpuid(&eax, &ebx, &ecx, &edx); return eax; } static inline unsigned int cpuid_ebx(unsigned int op) { unsigned int eax, ebx, ecx, edx; + eax = op; __cpuid(&eax, &ebx, &ecx, &edx); return ebx; } static inline unsigned int 
cpuid_ecx(unsigned int op) { unsigned int eax, ebx, ecx, edx; + eax = op; __cpuid(&eax, &ebx, &ecx, &edx); return ecx; } static inline unsigned int cpuid_edx(unsigned int op) { unsigned int eax, ebx, ecx, edx; + eax = op; __cpuid(&eax, &ebx, &ecx, &edx); return edx; } diff -urp linux-2.6.19-paravirt0/include/asm-x86_64/paravirt.h linux-2.6.19-paravirt1/include/asm-x86_64/paravirt.h --- linux-2.6.19-paravirt0/include/asm-x86_64/paravirt.h 2007-01-11 21:56:03.000000000 -0200 +++ linux-2.6.19-paravirt1/include/asm-x86_64/paravirt.h 2007-01-11 22:50:41.000000000 -0200 @@ -17,7 +17,8 @@ #define PARAVIRT_SAVE_FLAGS 3 #define PARAVIRT_SAVE_FLAGS_IRQ_DISABLE 4 #define PARAVIRT_INTERRUPT_RETURN 5 -#define PARAVIRT_STI_SYSRETQ 6 +#define PARAVIRT_SYSRETQ 6 +#define PARAVIRT_SWAPGS 7 /* Bitmask of what can be clobbered: usually at least rax. */ #define CLBR_NONE 0x0 @@ -34,6 +35,11 @@ struct desc_struct; struct tss_struct; struct mm_struct; +struct swapgs { + u64 ret; + void (*fn)(void); +}; + struct paravirt_ops { int paravirt_enabled; @@ -43,6 +49,9 @@ struct paravirt_ops const char *name; + unsigned long *vsyscall_page; + + struct swapgs swapgs; /* * Patch may replace one of the defined code sequences with arbitrary * code, subject to the same register constraints. This generally @@ -89,6 +98,7 @@ struct paravirt_ops void (*restore_fl)(unsigned long); void (*irq_disable)(void); void (*irq_enable)(void); + void (*safe_halt)(void); void (*halt)(void); void (*wbinvd)(void); @@ -98,6 +108,7 @@ struct paravirt_ops int (*write_msr)(unsigned int msr, u64 val); u64 (*read_tsc)(void); + u64 (*read_tscp)(int *aux); u64 (*read_pmc)(void); void (*load_tr_desc)(void); @@ -167,7 +178,7 @@ struct paravirt_ops void (*set_lazy_mode)(int mode); /* These two are jmp to, not actually called. 
*/ - void (*irq_enable_sysexit)(void); + void (*sysret)(void); void (*iret)(void); void (*startup_ipi_hook)(int phys_apicid, unsigned long start_eip, unsigned long start_esp); @@ -262,6 +273,14 @@ static inline void halt(void) val2 = _l >> 32; \ } while(0) +/* rdmsr with exception handling */ +#define rdmsr_safe(msr,a,b) ({ \ + int _err; \ + u64 _l = paravirt_ops.read_msr(msr,&_err); \ + (*a) = (u32)_l; \ + (*b) = _l >> 32; \ + _err; }) + #define wrmsr(msr,val1,val2) do { \ u64 _l = ((u64)(val2) << 32) | (val1); \ paravirt_ops.write_msr((msr), _l); \ @@ -273,19 +292,12 @@ static inline void halt(void) } while(0) #define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val))) + #define wrmsr_safe(msr,a,b) ({ \ u64 _l = ((u64)(b) << 32) | (a); \ paravirt_ops.write_msr((msr),_l); \ }) -/* rdmsr with exception handling */ -#define rdmsr_safe(msr,a,b) ({ \ - int _err; \ - u64 _l = paravirt_ops.read_msr(msr,&_err); \ - (*a) = (u32)_l; \ - (*b) = _l >> 32; \ - _err; }) - #define rdtsc(low,high) do { \ u64 _l = paravirt_ops.read_tsc(); \ low = (u32)_l; \ @@ -299,6 +311,14 @@ static inline void halt(void) #define rdtscll(val) (val = paravirt_ops.read_tsc()) +#define rdtscp(low,high,aux) do { \ + u64 _val = paravirt_ops.read_tscp(&aux); \ + low = (int)_val; \ + high = _val >> 32; \ +} while (0) + +#define rdtscpll(val, aux) (val) = paravirt_ops.read_tscp(&aux) + #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) #define rdpmc(counter,low,high) do { \ @@ -375,7 +395,6 @@ void native_pte_clear(struct mm_struct * void native_pmd_clear(pmd_t *pmd); void native_nop(void); - static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { @@ -483,6 +502,9 @@ struct paravirt_patch { " .short " __stringify(clobber) "\n" \ ".popsection" +/* These functions tends to be very simple. So, if they touch any register, + * the calle-saved ones may already fulfill their needs, and hopefully we + * have no need to save any. 
*/ static inline unsigned long __raw_local_save_flags(void) { unsigned long f; @@ -533,18 +555,12 @@ static inline unsigned long __raw_local_ return f; } +#define CLI_STRING paravirt_alt("call *paravirt_ops+%c[irq_disable];", \ + PARAVIRT_IRQ_DISABLE, CLBR_NONE) +#define STI_STRING paravirt_alt("call *paravirt_ops+%c[irq_enable];", \ + PARAVIRT_IRQ_ENABLE, CLBR_NONE) -/* Still x86-ish */ -#define CLI_STRING paravirt_alt("pushq %%rcx; pushq %%rdx;" \ - "call *paravirt_ops+%c[irq_disable];" \ - "popq %%rdx; popq %%rcx", \ - PARAVIRT_IRQ_DISABLE, CLBR_RAX) - -#define STI_STRING paravirt_alt("pushq %%rcx; pushq %%rdx;" \ - "call *paravirt_ops+%c[irq_enable];" \ - "popq %%rdx; popq %%rcx", \ - PARAVIRT_IRQ_ENABLE, CLBR_RAX) #define CLI_STI_CLOBBERS , "%rax" #define CLI_STI_INPUT_ARGS \ , \ @@ -571,22 +587,23 @@ static inline unsigned long __raw_local_ #define DISABLE_INTERRUPTS(clobbers) \ PARA_PATCH(PARAVIRT_IRQ_DISABLE, clobbers, \ - pushq %rcx; pushq %rdx; \ - call *paravirt_ops+PARAVIRT_irq_disable; \ - popq %rdx; popq %rcx) \ + call *paravirt_ops+PARAVIRT_irq_disable) #define ENABLE_INTERRUPTS(clobbers) \ PARA_PATCH(PARAVIRT_IRQ_ENABLE, clobbers, \ - pushq %rcx; pushq %rdx; \ - call *%cs:paravirt_ops+PARAVIRT_irq_enable; \ - popq %rdx; popq %rcx) - -#define ENABLE_INTERRUPTS_SYSRETQ \ - PARA_PATCH(PARAVIRT_STI_SYSRETQ, CLBR_ANY, \ - jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit) + call *%cs:paravirt_ops+PARAVIRT_irq_enable) -#define GET_CR0_INTO_RAX \ - call *paravirt_ops+PARAVIRT_read_cr0 +#define SYSRETQ \ + PARA_PATCH(PARAVIRT_SYSRETQ, CLBR_ANY, \ + jmp *%cs:paravirt_ops+PARAVIRT_sysret) + +#define SWAPGS \ + movq $. 
+ 0x11, (paravirt_ops+PARAVIRT_swapgs); \ + jmp (paravirt_ops+PARAVIRT_swapgs+8); \ + +/* this is needed in early_idt_handler */ +#define GET_CR2_INTO_RAX \ + call *paravirt_ops+PARAVIRT_read_cr2 #endif /* __ASSEMBLY__ */ #else /* !CONFIG_PARAVIRT */ diff -urp linux-2.6.19-paravirt0/include/asm-x86_64/timex.h linux-2.6.19-paravirt1/include/asm-x86_64/timex.h --- linux-2.6.19-paravirt0/include/asm-x86_64/timex.h 2006-12-11 17:32:53.000000000 -0200 +++ linux-2.6.19-paravirt1/include/asm-x86_64/timex.h 2007-01-10 15:10:00.000000000 -0200 @@ -31,14 +31,29 @@ static __always_inline cycles_t get_cycl { unsigned long long ret; unsigned eax; + unsigned int (*fn)(unsigned int) = &cpuid_eax; /* Don't do an additional sync on CPUs where we know RDTSC is already synchronous. */ - alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC, - "=a" (eax), "0" (1) : "ebx","ecx","edx","memory"); + alternative_io("call *%3", ASM_NOP2, X86_FEATURE_SYNC_RDTSC, + "=a" (eax) , "D" (1) , "m" (fn)); rdtscll(ret); return ret; } +/* Inside a vsyscall, we cannot call paravirt functions. (like rdtsc + * and cpuid). For the host, use this function instead */ +static __always_inline cycles_t vget_cycles_sync(void) +{ + unsigned long ret; + unsigned eax; + /* Don't do an additional sync on CPUs where we know + RDTSC is already synchronous. */ + alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC, + "=a" (eax), "0" (1) : "ebx","ecx","edx","memory"); + + asm volatile("rdtsc" : "=A" (ret)); + return ret; +} extern unsigned int cpu_khz; extern int read_current_timer(unsigned long *timer_value);