plain text document attachment (lguest64.patch) This is the main core code for the lguest64. Have fun, and don't hurt the puppies! Signed-off-by: Steven Rostedt <srostedt@xxxxxxxxxx> Signed-off-by: Glauber de Oliveira Costa <glommer@xxxxxxxxx> Cc: Chris Wright <chrisw@xxxxxxxxxxxx> Index: work-pv/arch/x86_64/lguest/Makefile =================================================================== --- /dev/null +++ work-pv/arch/x86_64/lguest/Makefile @@ -0,0 +1,24 @@ +# Guest requires the paravirt_ops replacement and the bus driver. +obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_bus.o + +# Host requires the other files, which can be a module. +obj-$(CONFIG_LGUEST) += lg.o +lg-objs := core.o hypervisor.o lguest_user.o hv_vm.o page_tables.o \ +hypercalls.o io.o interrupts_and_traps.o lguest_debug.o + +# hypercalls.o page_tables.o interrupts_and_traps.o \ +# segments.o io.o lguest_user.o + +# We use top 4MB for guest traps page, then hypervisor. */ +HYPE_ADDR := (0xFFC00000+4096) +# The data is only 1k (256 interrupt handler pointers) +HYPE_DATA_SIZE := 1024 +CFLAGS += -DHYPE_ADDR="$(HYPE_ADDR)" -DHYPE_DATA_SIZE="$(HYPE_DATA_SIZE)" + +##$(obj)/core.o: $(obj)/hypervisor-blob.c +### This links the hypervisor in the right place and turns it into a C array. +##$(obj)/hypervisor-raw: $(obj)/hypervisor.o +## @$(LD) -static -Tdata=`printf %#x $$(($(HYPE_ADDR)))` -Ttext=`printf %#x $$(($(HYPE_ADDR)+$(HYPE_DATA_SIZE)))` -o $@ $< && $(OBJCOPY) -O binary $@ +##$(obj)/hypervisor-blob.c: $(obj)/hypervisor-raw +## @od -tx1 -An -v $< | sed -e 's/^ /0x/' -e 's/$$/,/' -e 's/ /,0x/g' > $@ + Index: work-pv/arch/x86_64/lguest/core.c =================================================================== --- /dev/null +++ work-pv/arch/x86_64/lguest/core.c @@ -0,0 +1,379 @@ +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/freezer.h> +#include <linux/kallsyms.h> +#include <asm/paravirt.h> +#include <asm/hv_vm.h> +#include <asm/uaccess.h> +#include <asm/i387.h> +#include "lguest.h" + +#define HV_OFFSET(x) (typeof(x))((unsigned long)(x)+lguest_hv_offset) + +unsigned long lguest_hv_addr; +unsigned long lguest_hv_offset; +int lguest_hv_pages; + +int lguest_vcpu_pages; +int lguest_vcpu_order; + +DEFINE_MUTEX(lguest_lock); + +int lguest_address_ok(const struct lguest_guest_info *linfo, u64 addr) +{ + return addr / PAGE_SIZE < linfo->pfn_limit; +} + +u8 lhread_u8(struct lguest_vcpu *vcpu, u64 addr) +{ + u8 val = 0; + + if (!lguest_address_ok(vcpu->guest, addr) + || get_user(val, (u8 __user *)addr) != 0) + kill_guest_dump(vcpu, "bad read address %llx", addr); + return val; +} + +u16 lhread_u16(struct lguest_vcpu *vcpu, u64 addr) +{ + u16 val = 0; + + if (!lguest_address_ok(vcpu->guest, addr) + || get_user(val, (u16 __user *)addr) != 0) + kill_guest_dump(vcpu, "bad read address %llx", addr); + return val; +} + +u64 lhread_u64(struct lguest_vcpu *vcpu, u64 addr) +{ + u64 val = 0; + + if (!lguest_address_ok(vcpu->guest, addr) + || get_user(val, (u64 __user *)addr) != 0) + kill_guest_dump(vcpu, "bad read address %llx", addr); + return val; +} + +void lhwrite_u64(struct lguest_vcpu *vcpu, u64 addr, u64 val) +{ + if (!lguest_address_ok(vcpu->guest, addr) + || put_user(val, (u64 __user *)addr) != 0) + kill_guest_dump(vcpu, "bad read address %llx", addr); +} + +void lhread(struct lguest_guest_info *linfo, void *b, u64 addr, unsigned bytes) +{ + if (addr + bytes < addr || !lguest_address_ok(linfo, addr+bytes) + || copy_from_user(b, (void __user *)addr, bytes) != 0) { + /* copy_from_user should do this, but as we rely on 
it... */ + memset(b, 0, bytes); + kill_guest(linfo, "bad read address %llx len %u", addr, bytes); + } +} + +void lhwrite(struct lguest_guest_info *linfo, u64 addr, const void *b, + unsigned bytes) +{ + if (addr + bytes < addr + || !lguest_address_ok(linfo, addr+bytes) + || copy_to_user((void __user *)addr, b, bytes) != 0) + kill_guest(linfo, "bad write address %llx len %u", addr, bytes); +} + +static struct gate_struct *get_idt_table(void) +{ + struct desc_ptr idt; + + asm("sidt %0":"=m" (idt)); + return (void *)idt.address; +} + +static int emulate_insn(struct lguest_vcpu *vcpu) +{ + u8 insn; + unsigned int insnlen = 0, in = 0, shift = 0; + unsigned long physaddr = guest_pa(vcpu->guest, vcpu->regs.rip); + + if (vcpu->regs.rip < vcpu->guest->page_offset) + return 0; + + lhread(vcpu->guest, &insn, physaddr, 1); + + /* Operand size prefix means it's actually for ax. */ + if (insn == 0x66) { + shift = 16; + insnlen = 1; + printk("physaddr + len: %lx\n",physaddr+insnlen); + lhread(vcpu->guest, &insn, physaddr + insnlen, 1); + } + + switch (insn & 0xFE) { + case 0xE4: /* in <next byte>,%al */ + insnlen += 2; + in = 1; + break; + case 0xEC: /* in (%dx),%al */ + insnlen += 1; + in = 1; + break; + case 0xE6: /* out %al,<next byte> */ + insnlen += 2; + break; + case 0xEE: /* out %al,(%dx) */ + insnlen += 1; + break; + default: + printk("%llx: %02x unimplemented op\n", vcpu->regs.rip, insn); + kill_guest_dump(vcpu, "bad op"); + return 0; + } + if (in) { + /* Lower bit tells is whether it's a 16 or 32 bit access */ + if (insn & 0x1) + vcpu->regs.rax = 0xFFFFFFFF; + else + vcpu->regs.rax |= (0xFFFF << shift); + } + vcpu->regs.rip += insnlen; + return 1; +} + +#define SAVE_CR2(cr2) asm volatile ("movq %%cr2, %0" : "=r" (cr2)) + +static void run_guest_once(struct lguest_vcpu *vcpu) +{ + void (*sw_guest)(struct lguest_vcpu *) = HV_OFFSET(&switch_to_guest); + unsigned long foo, bar; + + BUG_ON(!vcpu->regs.cr3); + BUG_ON(!vcpu->pgdir); + BUG_ON(!vcpu->pgdir->pgdir); + asm volatile ("pushq %2; pushq %%rsp; pushfq; pushq %3; call *%6;" + /* The stack we pushed is off by 8, due to the previous pushq */ + "addq $8, %%rsp" + : "=D"(foo), "=a"(bar) + : "i" (__KERNEL_DS), "i" (__KERNEL_CS), "0" (vcpu), "1"(get_idt_table()), + "r" (sw_guest) + : "memory", "cc"); +} + +/* FIXME: don't know yet the right parameters to put here */ +int run_guest(struct lguest_vcpu *vcpu, char *__user user) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct desc_struct *gdt_table; + struct lguest_regs *regs = &vcpu->regs; + int ret; + + unsigned long cr2 = 0; + + while (!linfo->dead) { + + if (regs->trapnum == LGUEST_TRAP_ENTRY) { + + if (lguest_debug) { + printk("hit trap %lld rip=", regs->trapnum); + lguest_print_address(vcpu, regs->rip); + printk("calling hypercall %d!\n", (unsigned)regs->rax); + } + + regs->trapnum = 255; + hypercall(vcpu); + if (linfo->dead) + lguest_dump_vcpu_regs(vcpu); + } + + if (signal_pending(current)) + return -EINTR; + + maybe_do_interrupt(vcpu); + + try_to_freeze(); + + if (linfo->dead) + return -1; + + + local_irq_disable(); + + /* + * keep a pointer to the host GDT tss address. + * Do this after disabling interrupts to make sure we + * are on the same CPU. + */ + gdt_table = cpu_gdt(smp_processor_id()); + vcpu->host_gdt_ptr = (unsigned long)gdt_table; + asm volatile ("sidt %0" : "=m"(vcpu->host_idt)); + + /* Even if *we* don't want FPU trap, guest might... 
*/ + if (vcpu->ts) + stts(); + + run_guest_once(vcpu); + + if (regs->trapnum == 14) { + SAVE_CR2(cr2); + lgdebug_print("faulting cr2: %lx\n",cr2); + } + + else if (regs->trapnum == 7) + math_state_restore(); + + if (lguest_debug && regs->trapnum < 32) { + printk("hit trap %lld rip=", regs->trapnum); + lguest_print_address(vcpu, regs->rip); + } + + local_irq_enable(); + + BUG_ON(regs->trapnum > 0xFF); + + switch (regs->trapnum) { + case 7: + /* We've intercepted a Device Not Available fault. */ + /* If they don't want to know, just absorb it. */ + if (!vcpu->ts) + continue; + if (reflect_trap(vcpu, 7, 1)) + continue; + kill_guest(vcpu->guest, "Unhandled FPU trap at %#llx", + regs->rip); + case 13: + if (!regs->errcode) { + ret = emulate_insn(vcpu); + if (ret < 0) { + lguest_dump_vcpu_regs(vcpu); + return ret; + } + continue; + } + kill_guest_dump(vcpu, "took gfp errcode %lld\n", regs->errcode); + lguest_dump_vcpu_regs(vcpu); + break; + case 14: + if (demand_page(vcpu, cr2, regs->errcode & PF_WRITE)) + continue; + + if (lguest_debug) { + printk ("guest taking a page fault\n"); + lguest_print_page_tables(vcpu->pgdir->pgdir); + } + + /* inform guest on the current state of cr2 */ + put_user(cr2, &linfo->lguest_data->cr2); + if (reflect_trap(vcpu, 14, 1)) + continue; + + lguest_dump_vcpu_regs(vcpu); + kill_guest_dump(vcpu, "unhandled page fault at %#lx" + " (rip=%#llx, errcode=%#llx)", + cr2, regs->rip, regs->errcode); + break; + case LGUEST_TRAP_ENTRY: + /* hypercall! */ + continue; + + case 32 ... 255: + cond_resched(); + break; + default: + kill_guest_dump(vcpu, "bad trapnum %lld\n", regs->trapnum); + lguest_dump_vcpu_regs(vcpu); + return -EINVAL; + } + } + return -ENOENT; +} + +extern long end_hyper_text; +extern long start_hyper_text; + +static int __init init(void) +{ + unsigned long pages; + unsigned long hvaddr; +#if 0 + unsigned long lg_hcall = (unsigned long)HV_OFFSET(&hcall_teste); + unsigned long *lg_host_syscall = + (unsigned long *)HV_OFFSET(&host_syscall); +#endif + int order; + int ret; + + int i; + printk("start_hyper_text=%p\n",&start_hyper_text); + printk("end_hyper_text=%p\n",&end_hyper_text); + printk("default_idt_entries=%p\n",&_lguest_default_idt_entries); + printk("sizeof(vcpu)=%ld\n",sizeof(struct lguest_vcpu)); + + pages = (sizeof(struct lguest_vcpu)+(PAGE_SIZE-1))/PAGE_SIZE; + for (order = 0; (1<<order) < pages; order++) + ; + + lguest_vcpu_pages = pages; + lguest_vcpu_order = order; + + ret = paravirt_enabled(); + if (ret < 0) + return -EPERM; + + ret = lguest_device_init(); + if (ret < 0) { + return ret; + } + + pages = (unsigned long)&end_hyper_text - + (unsigned long)&start_hyper_text; + pages = (pages + (PAGE_SIZE - 1)) / PAGE_SIZE; + + ret = hvvm_map_pages(&start_hyper_text, pages, &hvaddr); + if (ret < 0) + goto out; + printk("hvaddr=%lx\n",hvaddr); + + lguest_hv_addr = hvaddr; + lguest_hv_pages = pages; + lguest_hv_offset = hvaddr - (unsigned long)&start_hyper_text; + + /* Setup LGUEST segments on all cpus */ + for_each_possible_cpu(i) { + struct desc_struct *gdt_table; + gdt_table = cpu_gdt(i); + gdt_table[GDT_ENTRY_HV_CS] = gdt_table[gdt_index(__KERNEL_CS)]; + gdt_table[GDT_ENTRY_HV_DS] = gdt_table[gdt_index(__KERNEL_DS)]; + } + +// rdmsrl(MSR_LSTAR, *lg_host_syscall); +// wrmsrl(MSR_LSTAR, lg_hcall); + return 0; +#if 0 + ret = init_pagetables(hvaddr); + if (ret < 0) + goto out2; + + return 0; + +out2: + hvvm_unnmap_pages(hvaddr, pages); +#endif +out: + lguest_device_remove(); + return ret; +} + + +static void __exit fini(void) +{ +#if 0 + unsigned long 
*lg_host_syscall = + (unsigned long *)HV_OFFSET(&host_syscall); + + wrmsrl(MSR_LSTAR, *lg_host_syscall); +#endif + hvvm_release_all(); + lguest_device_remove(); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); Index: work-pv/arch/x86_64/lguest/hypercalls.c =================================================================== --- /dev/null +++ work-pv/arch/x86_64/lguest/hypercalls.c @@ -0,0 +1,324 @@ +/* Actual hypercalls, which allow guests to actually do something. + Copyright (C) 2007, Glauber de Oliveira Costa <gcosta@xxxxxxxxxx> + Steven Rostedt <srostedt@xxxxxxxxxx> + Red Hat Inc + Standing on the shoulders of Rusty Russell. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include <linux/uaccess.h> +#include <linux/syscalls.h> +#include <linux/mm.h> +#include <asm/lguest.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/msr.h> +#include "lguest.h" + +/* FIXME: add this to Kconfig */ +#define CONFIG_LGUEST_DEBUG 1 + +static void guest_set_stack(struct lguest_vcpu *vcpu, + u64 rsp, unsigned int pages) +{ + /* You cannot have a stack segment with priv level 0. */ + if (pages > 2) + kill_guest_dump(vcpu, "bad stack pages %u", pages); + vcpu->tss.rsp2 = rsp; + /* FIXME */ +// lg->stack_pages = pages; +// pin_stack_pages(lg); +} + +static DEFINE_MUTEX(hcall_print_lock); +#define HCALL_PRINT_SIZ 1024 +static char hcall_print_buf[HCALL_PRINT_SIZ]; + +/* Return true if DMA to host userspace now pending. 
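do_hcall() below dispatches on the call number the guest leaves in %rax, with up to three arguments in %rdx, %rbx and %rcx and any result passed back in %rax. The guest-side wrapper is not part of this file; as a rough sketch (assuming the usual lguest convention and the LGUEST_TRAP_ENTRY vector from asm/lguest.h), it amounts to:

static inline unsigned long hcall(unsigned long call, unsigned long arg1,
				  unsigned long arg2, unsigned long arg3)
{
	/* Trap into the host: call number in rax, args in rdx/rbx/rcx,
	 * and the host's answer comes back in rax. */
	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
		     : "=a"(call)
		     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
		     : "memory");
	return call;
}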
*/ +static int do_hcall(struct lguest_vcpu *vcpu) +{ + struct lguest_regs *regs = &vcpu->regs; + struct lguest_guest_info *linfo = vcpu->guest; + unsigned long val; + unsigned long ret; + + switch (regs->rax) { + case LHCALL_PRINT: + mutex_lock(&hcall_print_lock); + ret = strncpy_from_user(hcall_print_buf, + (const char __user *)regs->rdx, + HCALL_PRINT_SIZ); + if (ret < 0) { + kill_guest_dump(vcpu, + "bad hcall print pointer (%llx)", + regs->rdx); + mutex_unlock(&hcall_print_lock); + return -EFAULT; + } + printk("LGUEST: %s", hcall_print_buf); + mutex_unlock(&hcall_print_lock); + + break; + case LHCALL_FLUSH_ASYNC: + break; + case LHCALL_LGUEST_INIT: + kill_guest_dump(vcpu, "already have lguest_data"); + break; + case LHCALL_RDMSR: + switch (regs->rdx) { + case MSR_KERNEL_GS_BASE: + val = (vcpu->guest_gs_shadow_a & ((1UL << 32)-1)) | + (vcpu->guest_gs_shadow_d << 32); + lhwrite_u64(vcpu, regs->rbx, val); + break; + case MSR_GS_BASE: + val = (vcpu->guest_gs_a & ((1UL << 32)-1)) | + (vcpu->guest_gs_d << 32); + lhwrite_u64(vcpu, regs->rbx, val); + break; + case MSR_FS_BASE: + lhwrite_u64(vcpu, regs->rbx, 0); + break; + case MSR_EFER: + val = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; + lhwrite_u64(vcpu, regs->rbx, val); + break; + default: + kill_guest_dump(vcpu, "bad read of msr %llx\n", regs->rdx); + } + break; + case LHCALL_WRMSR: + switch (regs->rdx) { + case MSR_KERNEL_GS_BASE: + if ((regs->rbx >= HVVM_START) && + (regs->rbx < (HVVM_START + HV_VIRT_SIZE))) { + kill_guest_dump(vcpu, + "guest trying to set GS shadow base" + " in hypervisor"); + break; + } + vcpu->guest_gs_shadow_a = regs->rbx; + vcpu->guest_gs_shadow_d = regs->rbx >> 32; + break; + case MSR_GS_BASE: + if ((regs->rbx >= HVVM_START) && + (regs->rbx < (HVVM_START + HV_VIRT_SIZE))) { + kill_guest_dump(vcpu, + "guest trying to set GS base in hypervisor"); + break; + } + vcpu->guest_gs_a = regs->rbx; + vcpu->guest_gs_d = regs->rbx >> 32; + break; + case MSR_FS_BASE: + /* always zero */ + break; + default: + kill_guest(linfo, "bad write to msr %llx\n", regs->rdx); + } + break; + case LHCALL_SET_PMD: + guest_set_pmd(vcpu, regs->rdx, regs->rbx, regs->rcx); + break; + case LHCALL_SET_PUD: + guest_set_pud(vcpu, regs->rdx, regs->rbx, regs->rcx); + break; + case LHCALL_SET_PGD: + guest_set_pgd(vcpu, regs->rdx, regs->rbx, regs->rcx); + break; + case LHCALL_SET_PTE: + guest_set_pte(vcpu, regs->rdx, regs->rbx, regs->rcx); + break; + + case LHCALL_FLUSH_TLB_SIG: + guest_flush_tlb_single(vcpu, regs->rdx, regs->rbx); + break; + case LHCALL_FLUSH_TLB: + if (regs->rdx) + guest_pagetable_clear_all(vcpu); + else + guest_pagetable_flush_user(vcpu); + break; + + case LHCALL_NEW_PGTABLE: + guest_new_pagetable(vcpu, regs->rdx); + break; + + case LHCALL_CRASH: { + char msg[128]; + lhread(linfo, msg, regs->rdx, sizeof(msg)); + msg[sizeof(msg)-1] = '\0'; + kill_guest_dump(vcpu, "CRASH: %s", msg); + break; + } + case LHCALL_LOAD_GDT: + /* i386 does a lot of gdt reloads. We don't. + * we may want to support it in the future for more + * strange code paths. Not now */ + return -ENOSYS; + + case LHCALL_LOAD_IDT_ENTRY: { + struct gate_struct g;; + if (regs->rdx > 0xFF) { + kill_guest(linfo, "There are just 255 idt entries." 
+ "What are you trying to do??"); + } + lhread(linfo, &g, regs->rbx, sizeof(g)); + load_guest_idt_entry(vcpu, regs->rdx,&g); + break; + } + case LHCALL_SET_STACK: + guest_set_stack(vcpu, regs->rdx, regs->rbx); + break; + case LHCALL_TS: + vcpu->ts = regs->rdx; + break; + case LHCALL_TIMER_READ: { + u32 now = jiffies; + mb(); + regs->rax = now - linfo->last_timer; + linfo->last_timer = now; + break; + } + case LHCALL_TIMER_START: + linfo->timer_on = 1; + if (regs->rdx != HZ) + kill_guest(linfo, "Bad clock speed %lli", regs->rdx); + linfo->last_timer = jiffies; + break; + case LHCALL_HALT: + linfo->halted = 1; + break; + case LHCALL_GET_WALLCLOCK: { + struct timeval tv; + do_gettimeofday(&tv); + regs->rax = tv.tv_sec; + break; + } + case LHCALL_BIND_DMA: + printk("Binding dma....\n"); + regs->rax = bind_dma(linfo, regs->rdx, regs->rbx, + regs->rcx >> 8, regs->rcx & 0xFF); + break; + case LHCALL_SEND_DMA: + printk("Sending dma....\n"); + return send_dma(linfo, regs->rdx, regs->rbx); + + case LHCALL_IRET: + guest_iret(vcpu); + break; +#if 0 + case LHCALL_LOAD_TLS: + guest_load_tls(lg, (struct desc_struct __user*)regs->rdx); + break; +#endif + + case LHCALL_DEBUG_ME: +#ifdef CONFIG_LGUEST_DEBUG + lguest_debug = regs->rdx; + printk("lguest debug turned %s\n", regs->rdx ? "on" : "off"); + lguest_dump_vcpu_regs(vcpu); +#else + { + static int once = 1; + if (once) { + once = 0; + printk("lguest debug is disabled, to use this " + "please enable CONFIG_LGUEST_DEBUG\n"); + } + } +#endif + break; + default: + kill_guest(linfo, "Bad hypercall %lli\n", regs->rax); + } + return 0; +} + +#if 0 +/* We always do queued calls before actual hypercall. */ +int do_async_hcalls(struct lguest *lg) +{ + unsigned int i, pending; + u8 st[LHCALL_RING_SIZE]; + + if (!lg->lguest_data) + return 0; + + if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) + return -EFAULT; + + for (i = 0; i < ARRAY_SIZE(st); i++) { + struct lguest_regs regs; + unsigned int n = lg->next_hcall; + + if (st[n] == 0xFF) + break; + + if (++lg->next_hcall == LHCALL_RING_SIZE) + lg->next_hcall = 0; + + get_user(regs.rax, &lg->lguest_data->hcalls[n].eax); + get_user(regs.rdx, &lg->lguest_data->hcalls[n].edx); + get_user(regs.rcx, &lg->lguest_data->hcalls[n].ecx); + get_user(regs.rbx, &lg->lguest_data->hcalls[n].ebx); + pending = do_hcall(lg, ®s); + put_user(0xFF, &lg->lguest_data->hcall_status[n]); + if (pending) + return 1; + } + + set_wakeup_process(lg, NULL); + return 0; +} +#endif + +int hypercall(struct lguest_vcpu *vcpu) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct lguest_regs *regs = &vcpu->regs; + int pending; + + if (!linfo->lguest_data) { + if (regs->rax != LHCALL_LGUEST_INIT) { + kill_guest(linfo, "hypercall %lli before LGUEST_INIT", + regs->rax); + return 0; + } + + linfo->lguest_data = (struct lguest_data __user *)regs->rdx; + /* We check here so we can simply copy_to_user/from_user */ + if (!lguest_address_ok(linfo, (long)linfo->lguest_data) + || !lguest_address_ok(linfo, (long)(linfo->lguest_data+1))){ + kill_guest(linfo, "bad guest page %p", linfo->lguest_data); + return 0; + } + /* update the page_offset info */ + get_user(linfo->page_offset, &linfo->lguest_data->page_offset); + get_user(linfo->start_kernel_map, &linfo->lguest_data->start_kernel_map); + +#if 0 + get_user(linfo->noirq_start, &linfo->lguest_data->noirq_start); + get_user(linfo->noirq_end, &linfo->lguest_data->noirq_end); +#endif + /* We reserve the top pgd entry. 
*/ + put_user(4U*1024*1024, &linfo->lguest_data->reserve_mem); + put_user(linfo->guest_id, &linfo->lguest_data->guest_id); + return 0; + } + pending = do_hcall(vcpu); + //set_wakeup_process(vcpu, NULL); + return pending; +} Index: work-pv/arch/x86_64/lguest/hypervisor.S =================================================================== --- /dev/null +++ work-pv/arch/x86_64/lguest/hypervisor.S @@ -0,0 +1,711 @@ +#include <asm/asm-offsets.h> +#include <asm/page.h> +#include <asm/msr.h> +#include <asm/segment.h> +#include "lguest.h" + +.text +.align PAGE_SIZE + +.global start_hyper_text + .type start_hyper_text, @function +start_hyper_text: + +.global host_syscall +host_syscall: + .quad 0 + +#define PRINT_L(L) \ + PRINT_OUT($L) + +#define PRINT_N(n) \ + PRINT_OUT($'0' + $n) + +#define PRINT_HEX(n) \ + mov n, %cl; \ + and $0xf, %cl; \ + cmp $0xa, %cl; \ + jge 11f; \ + add $'0', %cl; \ + jmp 12f; \ +11: add $('a' - 10), %cl; \ +12: PRINT_OUT(%cl); + +#define PRINT_NUM_BX \ +9: PRINT_HEX(%bl); \ + shr $4, %rbx; \ + jne 9b + +#define PRINT_NUM(n) \ + movl $n, %ebx; \ + PRINT_NUM_BX; \ + PRINT_L('\n'); \ + PRINT_L('\r') + +#define PRINT_LONG(n) \ + movl n, %ebx; \ + PRINT_NUM_BX; \ + PRINT_L('\n'); \ + PRINT_L('\r') + +#define PRINT_QUAD(n) \ + movq n, %rbx; \ + PRINT_NUM_BX; \ + PRINT_L('\n'); \ + PRINT_L('\r') + +#define PRINT_X \ + PRINT_L('x') + +#define PRINT_OUT(x) \ + mov $0x3f8, %esi; \ +21: lea 0x5(%esi), %edx; \ + movzwl %dx, %edx; \ + in (%dx), %al; \ + test $0x20,%al; \ + jne 22f; \ + pause; \ + jmp 21b; \ +22: \ + movl %esi, %edx; \ + movzwl %dx, %edx; \ + mov x, %al; \ + out %al, (%dx); \ +31: \ + lea 0x5(%esi), %edx; \ + movzwl %dx, %edx; \ + in (%dx), %al; \ + test $0x20,%al; \ + jne 32f; \ + pause; \ + jmp 31b; \ +32: \ + +#define PUSH_NUM \ + pushq %rcx; \ + pushq %rbx; + +#define POP_NUM \ + pushq %rbx; \ + pushq %rcx; + +#define PUSH_PRINT \ + pushq %rsi; \ + pushq %rdx; \ + pushq %rax; \ + +#define POP_PRINT \ + popq %rax; \ + popq %rdx; \ + popq %rsi; + +#define S_PRINT_NUM(_n) \ + PUSH_PRINT; \ + PUSH_NUM; \ + PRINT_NUM(_n); \ + POP_NUM; \ + POP_PRINT; + +#define S_PRINT_L(x) \ + PUSH_PRINT; \ + PRINT_L(x); \ + POP_PRINT; + +#define S_PRINT_QUAD(_n) \ + PUSH_PRINT; \ + PUSH_NUM; \ + PRINT_QUAD(_n); \ + POP_NUM; \ + POP_PRINT; + +/* Save registers on the current stack. 
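The PRINT_OUT macro above is a polled write to the legacy COM1 UART at I/O port 0x3f8: it spins until the transmit-holding-register-empty bit (0x20, bit 5 of the line status register at base+5) is set, writes the byte to the data register, then waits for it to drain again. The core of that loop in C, as a sketch only (the assembly form is used so the hypervisor text needs no C runtime or data sections):

static void serial_putc(u8 c)
{
	/* Wait for THRE, bit 5 of the line status register. */
	while (!(inb(0x3f8 + 5) & 0x20))
		cpu_relax();
	outb(c, 0x3f8);
}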
Both for + * switch_to_guest and switch_to_host usage */ +#define SAVE_REGS \ + /* Save old guest/host state */ \ + pushq %fs; \ + pushq %rax; \ + pushq %r15; \ + pushq %r14; \ + pushq %r13; \ + pushq %r12; \ + pushq %r11; \ + pushq %r10; \ + pushq %r9; \ + pushq %r8; \ + pushq %rbp; \ + pushq %rdi; \ + pushq %rsi; \ + pushq %rdx; \ + pushq %rcx; \ + pushq %rbx; \ + +#define RESTORE_REGS \ + /* Save old guest/host state */ \ + popq %rbx; \ + popq %rcx; \ + popq %rdx; \ + popq %rsi; \ + popq %rdi; \ + popq %rbp; \ + popq %r8; \ + popq %r9; \ + popq %r10; \ + popq %r11; \ + popq %r12; \ + popq %r13; \ + popq %r14; \ + popq %r15; \ + popq %rax; \ + popq %fs; \ + +.macro dump_stack_regs PREFIX + movq $LGUEST_REGS_size, %r10 + xorq %r11, %r11 +1: PRINT_L(\PREFIX); + movq %r11, %rbx; + PRINT_NUM_BX; + PRINT_L(':'); PRINT_L(' '); + movq %rsp, %r9 + addq %r11, %r9 + PRINT_QUAD((%r9)) + addq $8, %r11 + cmp %r11, %r10 + ja 1b +.endm + +.macro debugme VCPU C + testb $1,LGUEST_VCPU_debug(\VCPU) + jz 23f + PRINT_L(\C) +23: +.endm + + +#if 0 +.global hcall_teste + .type hcall_teste, @function +hcall_teste: + cmpq $0, %gs:pda_vcpu + jne handle_guest + jmp *host_syscall +handle_guest: + /* SAVE_REGS maybe it is not the macro we want */ + #cmpq $__PAGE_OFFSET, %rcx; + jb do_hypercall + movq %gs:pda_vcpu, %rcx; + movq LGUEST_VCPU_guest_syscall(%rcx), %rcx; +#endif + +/** + * DECODE_IDT parse a IDT descriptor to find the target. + * @IDT - The register that holds the IDT descriptor location + * @IDTWORD - The word version of the IDT register + * (ie. IDT is %rax, then IDTWORD must be %ax) + * @RESULT - The regsiter to place the result. + * + * This clobbers both IDT and RESULT regs. + */ +.macro DECODE_IDT IDT IDTWORD RESULT + movzwq (\IDT), \RESULT + movq 4(\IDT), \IDT + xorw \IDTWORD, \IDTWORD + orq \IDT, \RESULT +.endm + +/** + * DECODE_SSEG parse a System Segment descriptor to find the target. + * @SEG - The register that holds the Sys Seg descriptor location + * @RESULT - The regsiter to place the result. + * @RW - The word version of the RESULT register + * @RH - The high byte version of the RESULT register + * + * (ie. RESULT is %rax, then RW must be %ax and RH must be %ah) + * + * This clobbers both SEG and RESULT regs. + */ +/* Why does Intel need to make everything so darn complex! */ +.macro DECODE_SSEG SEG RESULT RW RH + movzbq 7(\SEG), \RESULT + shl $16, \RESULT + movb 4(\SEG), \RH + shl $8, \RESULT + movw 2(\SEG), \RW + movq 8(\SEG), \SEG + shlq $32, \SEG + orq \SEG, \RESULT +.endm + +.global switch_to_guest + .type switch_to_guest, @function +/* rdi holds the pointer to vcpu. 
+ * Interrupts are off on entry */ +switch_to_guest: + SAVE_REGS + /* save host stack */ + movq %rsp, LGUEST_VCPU_host_stack(%rdi) + /* put the guest's stack in */ + movq %rdi, %rsp + /* move the stack to point to guest regs */ + addq $LGUEST_VCPU_regs, %rsp + /* filling this pointer has the effect of signalizing we're + * running guest code */ + movq %rdi, %gs:pda_vcpu + + /* save this host's gdt and idt */ + sgdt LGUEST_VCPU_host_gdt(%rdi) + sidt LGUEST_VCPU_host_idt(%rdi) + + /* Save the gs base of the host (for nmi use) */ + movl $MSR_GS_BASE, %ecx + rdmsr + movq %rax, LGUEST_VCPU_host_gs_a(%rdi) + movq %rdx, LGUEST_VCPU_host_gs_d(%rdi) + + /* Save the host proc gs pointer */ + movl $MSR_KERNEL_GS_BASE, %ecx + rdmsr + movq %rax, LGUEST_VCPU_host_proc_gs_a(%rdi) + movq %rdx, LGUEST_VCPU_host_proc_gs_d(%rdi) + + /* save the hosts page tables */ + movq %cr3, %rax + movq %rax, LGUEST_VCPU_host_cr3(%rdi) + + /* + * The NMI is a big PITA. There's no way to atomically load the + * TSS and IDT, so we can't just switch to the guest TSS without + * causing a race condition with the NMI. + * So we set up the host NMI stack in the guest TSS IST so that + * in case we take an NMI after loading our TR register + * but before we've updated the lidt, we still have a valid + * stack for the host nmi handler to use. + */ + /* Load the guest gdt */ + lgdt LGUEST_VCPU_gdt(%rdi) + + /* Switch to guest's TSS (before loading the idt) */ + movl $(GDT_ENTRY_TSS*8), %ebx + ltr %bx + + /* Set host's TSS to available (clear byte 5 bit 2). */ + movq LGUEST_VCPU_host_gdt_ptr(%rdi), %rax + andb $0xFD, (GDT_ENTRY_TSS*8+5)(%rax) + + /* Now load the guest idt */ + lidt LGUEST_VCPU_idt(%rdi) + + /* Load the guest gs pointer */ + movl $MSR_KERNEL_GS_BASE, %ecx + movq LGUEST_VCPU_guest_gs_a(%rdi), %rax + movq LGUEST_VCPU_guest_gs_d(%rdi), %rdx + wrmsr + + /* Flush the TLB */ + movq %cr4, %rax + movq %rax, %rbx + andb $~(1<<7), %al + movq %rax, %cr4 + movq %rbx, %cr4 + + /* switch to the guests page tables */ + popq %rax + movq %rax, %cr3 + + /* Now we swap gs to the guest gs base */ + swapgs + + /* restore guest registers */ + RESTORE_REGS + /* skip trapnum and errorcode */ + addq $0x10, %rsp; + iretq + +.macro print_trap VCPU REG + movq LGUEST_VCPU_trapnum(\VCPU), \REG + PRINT_QUAD(\REG) +.endm + +#define SWITCH_TO_HOST \ + SAVE_REGS; \ + /* Save old pgdir */ \ + movq %cr3, %rax; \ + pushq %rax; \ + /* Point rdi to the vcpu struct */ \ + movq %rsp, %rdi; \ + subq $LGUEST_VCPU_regs, %rdi; \ + /* Load lguest ds segment for convenience. */ \ + movq $(__HV_DS), %rax; \ + movq %rax, %ds; \ + /* Load the host page tables since that's where the gdt is */ \ + movq LGUEST_VCPU_host_cr3(%rdi), %rax; \ + movq %rax, %cr3; \ + /* Switch to hosts gdt */ \ + lgdt LGUEST_VCPU_host_gdt(%rdi); \ + /* Set guest's TSS to available (clear byte 5 bit 2). */ \ + movq LGUEST_VCPU_vcpu(%rdi), %rax; \ + andb $0xFD, (LGUEST_VCPU_gdt_table+GDT_ENTRY_TSS*8+5)(%rax); \ + /* Swap back to the host PDA */ \ + swapgs; \ + /* Put back the host process gs as well */ \ + movl $MSR_KERNEL_GS_BASE,%ecx; \ + movq LGUEST_VCPU_host_proc_gs_a(%rdi), %rax; \ + movq LGUEST_VCPU_host_proc_gs_d(%rdi), %rdx; \ + wrmsr; \ + /* With PDA back now switch to host idt */ \ + lidt LGUEST_VCPU_host_idt(%rdi); \ + /* Switch to host's TSS. */ \ + movl $(GDT_ENTRY_TSS*8), %eax; \ + ltr %ax; \ + /* put flag down. We're in the host again */ \ + movq $0, %gs:pda_vcpu; \ + movq LGUEST_VCPU_host_stack(%rdi), %rsp; \ + RESTORE_REGS; + +/* Return to run_guest_once. 
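Both switch paths treat the vcpu's register save area as the stack, so the field order of struct lguest_regs (defined in asm/lguest.h, which is not part of this file) has to mirror the push order of SAVE_REGS plus the trap stubs and the hardware frame. The implied layout, as an inferred sketch:

struct lguest_regs {
	/* pushed by SAVE_REGS / popped by RESTORE_REGS, lowest address first */
	u64 rbx, rcx, rdx, rsi, rdi, rbp;
	u64 r8, r9, r10, r11, r12, r13, r14, r15;
	u64 rax;
	u64 fs;
	/* pushed by the IRQ stubs (error code faked when the CPU has none) */
	u64 trapnum, errcode;
	/* hardware interrupt/iretq frame */
	u64 rip, cs, rflags, rsp, ss;
};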
*/ +return_to_host: + SWITCH_TO_HOST + iretq + +deliver_to_host: + SWITCH_TO_HOST +decode_idt_and_jmp: + /* Decode IDT and jump to hosts' irq handler. When that does iret, it + * will return to run_guest_once. This is a feature. */ + /* We told gcc we'd clobber rdi and rax... */ + movq LGUEST_VCPU_trapnum(%rdi), %rdi + shl $1, %rdi + leaq (%rax,%rdi,8), %rdi + DECODE_IDT %rdi %di %rax + jmp *%rax + +#define NMI_SWITCH_TO_HOST \ + /* Force switch to host, GDT, CR3, and both GS bases */ \ + movl $MSR_GS_BASE, %ecx; \ + movq LGUEST_VCPU_host_gs_a(%rdi), %rax; \ + movq LGUEST_VCPU_host_gs_d(%rdi), %rdx; \ + wrmsr; \ + movl $MSR_KERNEL_GS_BASE, %ecx; \ + movq LGUEST_VCPU_host_proc_gs_a(%rdi), %rax; \ + movq LGUEST_VCPU_host_proc_gs_d(%rdi), %rdx; \ + wrmsr; \ + movq LGUEST_VCPU_host_cr3(%rdi), %rax; \ + movq %rax, %cr3; \ + lgdt LGUEST_VCPU_host_gdt(%rdi); + +#if 0 + /* Set host's TSS to available (clear byte 5 bit 2). */ \ + movq LGUEST_VCPU_host_gdt_ptr(%rdi), %rax; \ + andb $0xFD, (GDT_ENTRY_TSS*8+5)(%rax); \ + +#endif + +/* Used by NMI only */ +/* + * The NMI is special because it uses its own stack, and needs to + * find the vcpu struct differently. + */ +nmi_trampoline: + /* nmi has it's own stack */ + SAVE_REGS + + /* save the cr3 */ + movq %cr3, %rax + pushq %rax + + /* get the vcpu struct */ + movq %rsp, %rdi + subq $LGUEST_VCPU_nmi_stack_end, %rdi + addq $LGUEST_REGS_size, %rdi /* compensate for saved regs */ + + /* compensate if our end pointer is not 16 bytes aligned */ + movq $LGUEST_VCPU_nmi_stack_end, %rax + andq $0xf, %rax; + addq %rax, %rdi; + +#if 0 /* in case we want to see where the nmi hit */ + movq LGUEST_REGS_rip(%rsp), %r8 + PRINT_L('R') + PRINT_QUAD(%r8) +#endif + + /* + * All guest descriptors are above the HV text code (here!) + * If we hit the suspected NMI race, our stack will be the host + * kernel stack, and that is in lower address space than the HV. + * So test to see if we are screwed. Don't do anything, but just + * report it! + */ + call 1f +1: + movq 0(%rsp), %rax /* put this RIP into rax */ + /* If rsp >= rax; jmp */ + cmpq %rax, %rsp + jge 1f + + PRINT_L('H'); PRINT_L('i'); PRINT_L('t'); PRINT_L(' '); + PRINT_L('N'); PRINT_L('M'); PRINT_L('I'); PRINT_L(' '); + PRINT_L('r'); PRINT_L('a'); PRINT_L('c'); + PRINT_L('\n'); PRINT_L('\r'); + +1: + /* put back the stack from the previous call */ + addq $8, %rsp + + /* + * If we take another NMI while saving, we need to start over + * and try again. It's OK as long as we don't overwrite + * the saved material. 
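DECODE_IDT, used by decode_idt_and_jmp above and again by the NMI path below, reassembles a handler address from a 16-byte long-mode gate: the 8-byte load at offset 4 picks up offset[31:16] and offset[63:32] together with the IST/type bytes, which the xorw then clears. Against the kernel's struct gate_struct the same decode is roughly the following (presumably what the GATE_ADDRESS() helper used in interrupts_and_traps.c expands to):

static unsigned long gate_target(const struct gate_struct *g)
{
	return (unsigned long)g->offset_low |
	       ((unsigned long)g->offset_middle << 16) |
	       ((unsigned long)g->offset_high << 32);
}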
+ */ + testq $1,LGUEST_VCPU_nmi_sw(%rdi) + jnz 1f + + /* Copy the saved regs */ + cld + movq %rdi, %rbx /* save off vcpu struct */ + leaq LGUEST_VCPU_nmi_regs(%rdi), %rdi + leaq 0(%rsp), %rsi + movq $(LGUEST_REGS_size/8), %rcx + rep movsq + + movq %rbx, %rdi /* put back vcpu struct */ + + /* save the gs base and shadow */ + movl $MSR_GS_BASE, %ecx + rdmsr + movq %rax, LGUEST_VCPU_nmi_gs_a(%rdi) + movq %rdx, LGUEST_VCPU_nmi_gs_d(%rdi) + + movl $MSR_KERNEL_GS_BASE, %ecx + rdmsr + movq %rax, LGUEST_VCPU_nmi_gs_shadow_a(%rdi) + movq %rdx, LGUEST_VCPU_nmi_gs_shadow_d(%rdi) + + /* save the gdt */ + sgdt LGUEST_VCPU_nmi_gdt(%rdi) + + /* set the switch flag to prevent another nmi from saving over this */ + movq $1, LGUEST_VCPU_nmi_sw(%rdi) + +1: + +#if 0 + S_PRINT_L('N') + S_PRINT_L('M') + S_PRINT_L('I') + S_PRINT_L(' ') + S_PRINT_L('l') + S_PRINT_L('g') + S_PRINT_L('u') + S_PRINT_L('e') + S_PRINT_L('s') + S_PRINT_L('t') + S_PRINT_L('\n') + S_PRINT_L('\r') +#endif + NMI_SWITCH_TO_HOST + + /* we want to come back here on the iret */ + pushq $__HV_DS + /* put the vcpu struct as our stack */ + pushq %rdi + pushfq + pushq $__HV_CS + + movq LGUEST_VCPU_host_idt_address(%rdi), %rax + + /* Decode the location of the host NMI handler */ + leaq 32(%rax), %rbx /* NMI IDT entry */ + DECODE_IDT %rbx %bx %rax + + callq *%rax + + /* + * Back from NMI, stack points to vcpu, and we can take + * more NMIs at this point. That's OK, since we only + * want to get to the original NMI interruption. We + * just restart this restore process. Nested NMIs will + * not destroy this data while the nmi_sw flag is set. + */ + movq %rsp, %rdi + + /* restore the cr3 */ + addq $(LGUEST_VCPU_nmi_regs), %rsp + popq %rax + movq %rax, %cr3 + + /* restore the gdt */ + lgdt LGUEST_VCPU_nmi_gdt(%rdi) + +#if 0 /* print magic */ + movq LGUEST_VCPU_magic(%rdi), %r8 + movq $(6*8), %r9 +1: subq $8, %r9 + movq %r9, %rcx + movq %r8, %rbx + shr %cl, %rbx + PRINT_OUT(%bl) + cmp $0, %r9 + jne 1b +#endif + + /* make both host and guest TSS available */ +#if 1 + movq LGUEST_VCPU_host_gdt_ptr(%rdi), %rax + andb $0xFD, (GDT_ENTRY_TSS*8+5)(%rax) + + andb $0xFD, (LGUEST_VCPU_gdt_table+GDT_ENTRY_TSS*8+5)(%rdi) +#endif + +#if 0 + movl $(GDT_ENTRY_TSS*8), %ebx + ltr %bx +#endif + + /* restore the gs base and shadow */ + movl $MSR_GS_BASE, %ecx + movq LGUEST_VCPU_nmi_gs_a(%rdi), %rax + movq LGUEST_VCPU_nmi_gs_d(%rdi), %rdx + wrmsr + + movl $MSR_KERNEL_GS_BASE, %ecx + movq LGUEST_VCPU_nmi_gs_shadow_a(%rdi), %rax + movq LGUEST_VCPU_nmi_gs_shadow_d(%rdi), %rdx + wrmsr + +#if 0 + PRINT_L('O') + PRINT_L('U') + PRINT_L('T') + PRINT_L('\n') + PRINT_L('\r') +#endif + +#if 1 + /* Flush the TLB */ + movq %cr4, %rax + movq %rax, %rbx + andb $~(1<<7), %al + movq %rax, %cr4 + movq %rbx, %cr4 +#endif + + RESTORE_REGS + + /* skip trapnum and errcode */ + addq $0x10, %rsp + + /* + * Careful here, we can't modify any regs anymore + * but we now have to zero out the nmi switch flag. + * So all the work will be done by the stack pointer. + */ + +#define SW_OFFSET (LGUEST_VCPU_nmi_sw - \ + (LGUEST_VCPU_nmi_regs + LGUEST_REGS_rip)) + movq $0, SW_OFFSET(%rsp) + + /* use iret to get back to where we were. */ + iretq; + /* Whoo, all done! 
*/ + +do_crash: + SAVE_REGS + movq %cr3, %rax; + pushq %rax; + PRINT_L('C');PRINT_L('r');PRINT_L('a');PRINT_L('s'); + PRINT_L('h');PRINT_L('i');PRINT_L('n');PRINT_L('g'); + PRINT_L('\n');PRINT_L('\r'); + + dump_stack_regs 'S' + + addq $16, %rsp + sgdt 0(%rsp) + PRINT_L('G');PRINT_L('D');PRINT_L('T');PRINT_L('L');PRINT_L(':');PRINT_L(' '); + xorq %r8, %r8 + movw (%rsp), %r8 + PRINT_QUAD(%r8) + PRINT_L('G');PRINT_L('D');PRINT_L('T');PRINT_L('A');PRINT_L(':');PRINT_L(' '); + movq 2(%rsp), %r8 + PRINT_QUAD(%r8) + + PRINT_L('C');PRINT_L('S');PRINT_L(':');PRINT_L(' '); + movq %cs, %rbx + PRINT_QUAD(%rbx) + movq %cs, %rbx + andb $(~3), %bl + addq %rbx, %r8 + movq 0(%r8), %r9 + PRINT_L('S');PRINT_L('E');PRINT_L('G');PRINT_L(':');PRINT_L(' '); + PRINT_QUAD(%r9); + movq $1, %r8; + shl $47, %r8 + andq %r9, %r8 + PRINT_L('P');PRINT_L(' ');PRINT_L(':');PRINT_L(' '); + PRINT_QUAD(%r8); + PRINT_L('D');PRINT_L('P');PRINT_L(':');PRINT_L(' '); + movq $3, %r8; + shl $45, %r8 + andq %r9, %r8 + PRINT_QUAD(%r8); + + + /* just die! */ +2: + pause + jmp 2b + + +/* Real hardware interrupts are delivered straight to the host. Others + cause us to return to run_guest_once so it can decide what to do. Note + that some of these are overridden by the guest to deliver directly, and + never enter here (see load_guest_idt_entry). */ +.macro IRQ_STUB N TARGET + .data; .quad 1f; .text; 1: + /* Make an error number for most traps, which don't have one. */ +/* .if (\N <> 2) && (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) */ + .if (\N < 10 || \N > 14) && (\N <> 17) + pushq $0 + .endif + pushq $\N + jmp \TARGET + .align 8 +.endm + +.macro IRQ_STUBS FIRST LAST TARGET + irq=\FIRST + .rept \LAST-\FIRST+1 + IRQ_STUB irq \TARGET + irq=irq+1 + .endr +.endm + +/* We intercept every interrupt, because we may need to switch back to + * host. Unfortunately we can't tell them apart except by entry + * point, so we need 256 entry points. + */ +irq_stubs: +.data +.global _lguest_default_idt_entries +_lguest_default_idt_entries: +.text + IRQ_STUBS 0 1 return_to_host /* First two traps */ + IRQ_STUB 2 nmi_trampoline /* NMI */ + IRQ_STUBS 3 7 return_to_host /* Rest of traps */ +/*debug for now */ + IRQ_STUB 8 do_crash /* Double fault! */ +#if 1 + IRQ_STUBS 9 31 return_to_host /* Rest of traps */ +#else + IRQ_STUBS 9 12 return_to_host /* Rest of traps */ + IRQ_STUB 13 do_crash /* GPF! 
*/ + IRQ_STUBS 14 31 return_to_host /* Rest of traps */ +#endif + IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */ + IRQ_STUB 128 return_to_host /* System call (overridden) */ + IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */ + + .align PAGE_SIZE +.global end_hyper_text + .type end_hyper_text, @function +end_hyper_text: + nop Index: work-pv/arch/x86_64/lguest/interrupts_and_traps.c =================================================================== --- /dev/null +++ work-pv/arch/x86_64/lguest/interrupts_and_traps.c @@ -0,0 +1,292 @@ +#include <linux/uaccess.h> +#include <asm/lguest.h> +#include <asm/desc.h> +#include <asm/hw_irq.h> +#include "lguest.h" + +static void push_guest_stack(struct lguest_vcpu *vcpu, + u64 __user **gstack, u64 val) +{ + lhwrite_u64(vcpu, (u64)--(*gstack), val); +} + +static u64 pop_guest_stack(struct lguest_vcpu *vcpu, + u64 __user **gstack) +{ + return lhread_u64(vcpu, (u64)(*gstack)++); +} + +void guest_iret(struct lguest_vcpu *vcpu) +{ + struct lguest_regs *regs = &vcpu->regs; + u64 __user *gstack; + u64 cs; + + gstack = (u64 __user *)guest_pa(vcpu->guest, regs->rsp); + + regs->rip = pop_guest_stack(vcpu, &gstack); + cs = pop_guest_stack(vcpu, &gstack); + + /* FIXME: determine if we are going back to userland */ + + regs->rflags = pop_guest_stack(vcpu, &gstack); + /* FIXME: check if this is correct */ + + if (regs->rflags & 512) + put_user(512, &vcpu->guest->lguest_data->irq_enabled); + + /* make sure interrupts are enabled */ + regs->rflags |= 512; + + regs->rsp = pop_guest_stack(vcpu, &gstack); + regs->ss = pop_guest_stack(vcpu, &gstack); + + /* restore the rax reg, since it was used by the guest to do the hcall */ + regs->rax = vcpu->rax; + + return; +} + +int reflect_trap(struct lguest_vcpu *vcpu, int trap_num, int has_err) +{ + struct lguest_regs *regs = &vcpu->regs; + u64 __user *gstack; + u64 rflags, irq_enable; + u64 offset; + + if (!vcpu->interrupt[trap_num]) { + printk("Not yet registered trap handler for %d\n",trap_num); + return 0; + } + + /* save off the rax reg */ + vcpu->rax = regs->rax; + + /* FIXME: test for ring change and set up vcpu->tss.rsp2 ? */ + gstack = (u64 __user *)guest_pa(vcpu->guest, regs->rsp); + offset = regs->rsp - (u64)gstack; + + /* We use IF bit in eflags to indicate whether irqs were disabled + (it's always 0, since irqs are enabled when guest is running). */ + get_user(irq_enable, &vcpu->guest->lguest_data->irq_enabled); + rflags = regs->rflags; + rflags |= (irq_enable & 512); + + /* FIXME: Really? */ + push_guest_stack(vcpu, &gstack, regs->ss); + push_guest_stack(vcpu, &gstack, regs->rsp); + push_guest_stack(vcpu, &gstack, rflags); + /* FIXME: determine if guest is in kernel or user mode */ + push_guest_stack(vcpu, &gstack, __KERNEL_CS); + push_guest_stack(vcpu, &gstack, regs->rip); + + if (has_err) + push_guest_stack(vcpu, &gstack, regs->errcode); + + /* Change the real stack so hypervisor returns to trap handler */ + regs->ss = __USER_DS; + regs->rsp = (u64)gstack + offset; + regs->cs = __USER_CS; + lgdebug_print("rip was at %p\n", (void*)regs->rip); + regs->rip = vcpu->interrupt[trap_num]; + + /* Disable interrupts for an interrupt gate. */ + if (test_bit(trap_num, vcpu->interrupt_disabled)) + put_user(0, &vcpu->guest->lguest_data->irq_enabled); + return 1; +#if 0 + /* Was ist da? */ + /* GS will be neutered on way back to guest. 
*/ + put_user(0, &lg->lguest_data->gs_gpf_eip); +#endif + return 0; +} + +void maybe_do_interrupt(struct lguest_vcpu *vcpu) +{ + unsigned int irq; + DECLARE_BITMAP(irqs, LGUEST_IRQS); + + if (!vcpu->guest->lguest_data) + return; + + /* If timer has changed, set timer interrupt. */ + if (vcpu->guest->timer_on && jiffies != vcpu->guest->last_timer) + set_bit(0, vcpu->irqs_pending); + + /* Mask out any interrupts they have blocked. */ + if (copy_from_user(&irqs, vcpu->guest->lguest_data->interrupts, + sizeof(irqs))) + return; + + bitmap_andnot(irqs, vcpu->irqs_pending, irqs, LGUEST_IRQS); + + irq = find_first_bit(irqs, LGUEST_IRQS); + if (irq >= LGUEST_IRQS) + return; + + /* If they're halted, we re-enable interrupts. */ + if (vcpu->guest->halted) { + /* Re-enable interrupts. */ + put_user(512, &vcpu->guest->lguest_data->irq_enabled); + vcpu->guest->halted = 0; + } else { + /* Maybe they have interrupts disabled? */ + u32 irq_enabled; + get_user(irq_enabled, &vcpu->guest->lguest_data->irq_enabled); + if (!irq_enabled) { + lgdebug_print("Irqs are disabled\n"); + return; + } + } + + if (vcpu->interrupt[irq + FIRST_EXTERNAL_VECTOR] != 0) { + lgdebug_print("Reflect trap: %x\n",irq+FIRST_EXTERNAL_VECTOR); + clear_bit(irq, vcpu->irqs_pending); + reflect_trap(vcpu, irq+FIRST_EXTERNAL_VECTOR, 0); + } + else { + lgdebug_print("out without doing it!!\n"); + } + +} + +void check_bug_kill(struct lguest_vcpu *vcpu) +{ +/* FIXME: Use rostedt magic kallsyms */ +#if 0 +#ifdef CONFIG_BUG + u32 eip = lg->state->regs.rip - PAGE_OFFSET; + u16 insn; + + /* This only works for addresses in linear mapping... */ + if (lg->state->regs.rip < PAGE_OFFSET) + return; + lhread(lg, &insn, eip, sizeof(insn)); + if (insn == 0x0b0f) { +#ifdef CONFIG_DEBUG_BUGVERBOSE + u16 l; + u32 f; + char file[128]; + lhread(lg, &l, eip+sizeof(insn), sizeof(l)); + lhread(lg, &f, eip+sizeof(insn)+sizeof(l), sizeof(f)); + lhread(lg, file, f - PAGE_OFFSET, sizeof(file)); + file[sizeof(file)-1] = 0; + kill_guest(lg, "BUG() at %#x %s:%u", eip, file, l); +#else + kill_guest(lg, "BUG() at %#x", eip); +#endif /* CONFIG_DEBUG_BUGVERBOSE */ + } +#endif /* CONFIG_BUG */ +#endif +} + +static void copy_trap(struct lguest_vcpu *vcpu, + unsigned int trap_num, + const struct gate_struct *desc) +{ + + /* Not present? */ + if (!desc->p) { + vcpu->interrupt[trap_num] = 0; + return; + } + + switch (desc->type) { + case 0xE: + set_bit(trap_num,vcpu->interrupt_disabled); + break; + case 0xF: + clear_bit(trap_num,vcpu->interrupt_disabled); + break; + default: + kill_guest(vcpu->guest, "bad IDT type %i for irq %x", + desc->type,trap_num); + } + + vcpu->interrupt[trap_num] = GATE_ADDRESS((*desc)); +} + +#if 0 + +/* FIXME: Put this in hypervisor.S and do something clever with relocs? */ +static u8 tramp[] += { 0x0f, 0xa8, 0x0f, 0xa9, /* push %gs; pop %gs */ + 0x36, 0xc7, 0x05, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x00, + /* movl 0, %ss:lguest_data.gs_gpf_eip */ + 0xe9, 0x55, 0x55, 0x55, 0x55 /* jmp dstaddr */ +}; +#define TRAMP_MOVL_TARGET_OFF 7 +#define TRAMP_JMP_TARGET_OFF 16 + +static u32 setup_trampoline(struct lguest *lg, unsigned int i, u32 dstaddr) +{ + u32 addr, off; + + off = sizeof(tramp)*i; + memcpy(lg->trap_page + off, tramp, sizeof(tramp)); + + /* 0 is to be placed in lguest_data.gs_gpf_eip. */ + addr = (u32)&lg->lguest_data->gs_gpf_eip + lg->page_offset; + memcpy(lg->trap_page + off + TRAMP_MOVL_TARGET_OFF, &addr, 4); + + /* Address is relative to where end of jmp will be. 
*/ + addr = dstaddr - ((-4*1024*1024) + off + sizeof(tramp)); + memcpy(lg->trap_page + off + TRAMP_JMP_TARGET_OFF, &addr, 4); + return (-4*1024*1024) + off; +} + +#endif +/* We bounce through the trap page, for two reasons: firstly, we need + the interrupt destination always mapped, to avoid double faults, + secondly we want to reload %gs to make it innocuous on entering kernel. + */ +/* guest kernel will not be mapped. we'd better do another schema */ +static void setup_idt(struct lguest_vcpu *vcpu, + unsigned int i, + const struct gate_struct *desc) +{ + u64 taddr; + + /* Not present? */ + if (!desc->p) { + /* FIXME: When we need this, we'll know... */ + if (vcpu->idt_table[i].p) + kill_guest(vcpu->guest, "trying to remove irq line %i:" + "removing interrupts not supported",i); + return; + } + +#if 0 + /* We could reflect and disable interrupts, but guest can do itself. */ + if (desc->type != 0xF) + kill_guest(vcpu->guest, "bad direct IDT %i type 0x%x", + i, desc->type); +#endif + + /* FIXME: We may need to fix segment? */ + _lguest_set_gate(&vcpu->idt_table[i], desc->type, GUEST_DPL, taddr, 0); +#if 0 + taddr = setup_trampoline(lg, i, (desc->a&0xFFFF)|(desc->b&0xFFFF0000)); +#endif +} + +void load_guest_idt_entry(struct lguest_vcpu *vcpu, unsigned int i, + struct gate_struct *d) +{ + switch (i) { + /* Ignore NMI, doublefault, hypercall, spurious interrupt. */ + case 2: + case 8: + case 14: + case 15: + case LGUEST_TRAP_ENTRY: + /* FIXME: We should handle debug and int3 */ + case 1: + case 3: + return; + default: + copy_trap(vcpu,i,d); + } +} + Index: work-pv/arch/x86_64/lguest/lguest.c =================================================================== --- /dev/null +++ work-pv/arch/x86_64/lguest/lguest.c @@ -0,0 +1,705 @@ +/* + * Lguest specific paravirt-ops implementation + * + * Copyright (C) 2007, Glauber de Oliveira Costa <gcosta@xxxxxxxxxx> + * Steven Rostedt <srostedt@xxxxxxxxxx> + * Red Hat Inc + * Standing on the shoulders of Rusty Russell. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ +#include <linux/kernel.h> +#include <linux/start_kernel.h> +#include <linux/string.h> +#include <linux/console.h> +#include <linux/screen_info.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/pfn.h> +#include <asm/bootsetup.h> +#include <asm/paravirt.h> +#include <asm/lguest.h> +#include <asm/lguest_user.h> +#include <asm/param.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/desc.h> +#include <asm/setup.h> +#include <asm/e820.h> +#include <asm/pda.h> +#include <asm/asm-offsets.h> +#include <asm/mce.h> +#include <asm/proto.h> +#include <asm/sections.h> + +struct lguest_data lguest_data; +struct lguest_device_desc *lguest_devices; +static __initdata const struct lguest_boot_info *boot = (void*)__START_KERNEL_map; +static struct lguest_text_ptr code_stack[2]; +extern int acpi_disabled; +extern int acpi_ht; + +extern const unsigned long kallsyms_addresses[] __attribute__((weak)); +extern const unsigned long kallsyms_num_syms __attribute__((weak)); +extern const u8 kallsyms_names[] __attribute__((weak)); +extern const u8 kallsyms_token_table[] __attribute__((weak)); +extern const u16 kallsyms_token_index[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[] __attribute__((weak)); + +static DEFINE_SPINLOCK(hcall_print_lock); +#define HCALL_BUFF_SIZ 1024 +static char hcall_buff[HCALL_BUFF_SIZ]; + +/* Set to true when the lguest_init is called. */ +static int lguest_paravirt; + +struct lguest_print_ops { + void (*vprint)(const char *fmt, va_list ap); +} *lguest_pops; + +void lguest_vprint(const char *fmt, va_list ap) +{ + if (lguest_pops) + lguest_pops->vprint(fmt, ap); +} + +void lguest_print(const char *fmt, ...) +{ + va_list ap; + + /* irq save? */ + va_start(ap, fmt); + lguest_vprint(fmt, ap); + va_end(ap); +} + +static void __lguest_vprint(const char *fmt, va_list ap) +{ + /* need to do this with interrupts disabled */ +// spin_lock(&hcall_print_lock); + vsnprintf(hcall_buff, HCALL_BUFF_SIZ-1, fmt, ap); + + hcall(LHCALL_PRINT, __pa(hcall_buff), 0, 0); +// spin_unlock(&hcall_print_lock); +} + +struct lguest_print_ops local_pops = {__lguest_vprint }; + +void lguest_set_debug(int d) +{ + if (lguest_paravirt) + hcall(LHCALL_DEBUG_ME, d, 0, 0); +} + +void async_hcall(unsigned long call, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + /* Note: This code assumes we're uniprocessor. */ + static unsigned int next_call; + unsigned long flags; + + local_irq_save(flags); + if (lguest_data.hcall_status[next_call] != 0xFF) { + /* Table full, so do normal hcall which will flush table. */ + hcall(call, arg1, arg2, arg3); + } else { + lguest_data.hcalls[next_call].eax = call; + lguest_data.hcalls[next_call].edx = arg1; + lguest_data.hcalls[next_call].ebx = arg2; + lguest_data.hcalls[next_call].ecx = arg3; + wmb(); + lguest_data.hcall_status[next_call] = 0; + if (++next_call == LHCALL_RING_SIZE) + next_call = 0; + } + local_irq_restore(flags); +} + +#ifdef PARAVIRT_LAZY_NONE /* Not in 2.6.20. 
*/ +static int lazy_mode; +static void lguest_lazy_mode(int mode) +{ + lazy_mode = mode; + if (mode == PARAVIRT_LAZY_NONE) + hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); +} + +static void lazy_hcall(unsigned long call, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3) +{ + if (lazy_mode == PARAVIRT_LAZY_NONE) + hcall(call, arg1, arg2, arg3); + else + async_hcall(call, arg1, arg2, arg3); +} +#else +#define lazy_hcall hcall +#endif + +static unsigned long save_fl(void) +{ + return lguest_data.irq_enabled; +} + +static void restore_fl(unsigned long flags) +{ + /* FIXME: Check if interrupt pending... */ + lguest_data.irq_enabled = flags; +} + +static void irq_disable(void) +{ + lguest_data.irq_enabled = 0; +} + +static void irq_enable(void) +{ + /* Linux i386 code expects bit 9 set. */ + /* FIXME: Check if interrupt pending... */ + lguest_data.irq_enabled = 512; +} + +static void lguest_load_gdt(const struct desc_ptr *desc) +{ + /* Does nothing. HV should have done everything for us */ +} + +static void lguest_load_idt(const struct desc_ptr *desc) +{ + unsigned int i; + struct gate_struct *idt = (void *)desc->address; + + for (i = 0; i < (desc->size+1)/16; i++) { + hcall(LHCALL_LOAD_IDT_ENTRY, i, __pa((u64)&idt[i]), 0); + } +} + +static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) +{ + hcall(LHCALL_CRASH, __pa(p), 0, 0); + return NOTIFY_DONE; +} + +static struct notifier_block paniced = { + .notifier_call = lguest_panic +}; + +static void lguest_memory_setup(void) +{ + /* We do this here because lockcheck barfs if before start_kernel */ + atomic_notifier_chain_register(&panic_notifier_list, &paniced); + + e820.nr_map = 0; + add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM); +} + +static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + int is_feature = (*eax == 1); + + native_cpuid(eax, ebx, ecx, edx); + if (is_feature) { + unsigned long *excap = (unsigned long *)ecx, + *features = (unsigned long *)edx; + /* Hypervisor needs to know when we flush kernel pages. */ + set_bit(X86_FEATURE_PGE, features); + /* We don't have any features! 
*/ + clear_bit(X86_FEATURE_VME, features); + clear_bit(X86_FEATURE_DE, features); + clear_bit(X86_FEATURE_PSE, features); + clear_bit(X86_FEATURE_PAE, features); + clear_bit(X86_FEATURE_SEP, features); + clear_bit(X86_FEATURE_APIC, features); + clear_bit(X86_FEATURE_MTRR, features); + /* No MWAIT, either */ + clear_bit(3, excap); + } +} + +static unsigned long current_cr3; +static void lguest_write_cr3(unsigned long cr3) +{ + hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0); + current_cr3 = cr3; +} + +static u64 lguest_read_msr(unsigned int msr, int *err) +{ + unsigned long val; + + *err = 0; + hcall(LHCALL_RDMSR, msr, __pa(&val), 0); + return val; +} + +static int lguest_write_msr(unsigned int msr, u64 val) +{ + hcall(LHCALL_WRMSR, msr, (unsigned long)val, 0); + return val; +} + +static u64 lguest_read_tsc(void) +{ + /* we don't use natives, otherwise they can recurse */ + unsigned int a,b; + asm volatile("rdtsc" : "=a" (a), "=d" (b)); + return a | (unsigned long)(b) << 32 ; +} + +static void lguest_flush_tlb(void) +{ + lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0); +} + +static void lguest_flush_tlb_kernel(void) +{ + lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); +} + +static void lguest_flush_tlb_single(u64 addr) +{ + lazy_hcall(LHCALL_FLUSH_TLB_SIG, current_cr3, addr, 0); +} + +static void lguest_set_pte(pte_t *ptep, pte_t pteval) +{ + *ptep = pteval; + hcall(LHCALL_SET_PTE, current_cr3, __pa(ptep), pte_val(pteval)); +} + +static void lguest_set_pte_at(struct mm_struct *mm, u64 addr, pte_t *ptep, pte_t pteval) +{ + *ptep = pteval; + lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), __pa(ptep), pte_val(pteval)); +} + +static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + *pmdp = pmdval; + lazy_hcall(LHCALL_SET_PMD, current_cr3, __pa(pmdp)&PTE_MASK, + (__pa(pmdp)&(PAGE_SIZE-1))/8); +} + +static void lguest_set_pud(pud_t *pudp, pud_t pudval) +{ + *pudp = pudval; + lazy_hcall(LHCALL_SET_PUD, current_cr3, __pa(pudp)&PTE_MASK, + (__pa(pudp)&(PAGE_SIZE-1))/8); +} + +static void lguest_set_pgd(pgd_t *pgdp, pgd_t pgdval) +{ + *pgdp = pgdval; + lazy_hcall(LHCALL_SET_PGD, current_cr3, __pa(pgdp)&PTE_MASK, + (__pa(pgdp)&(PAGE_SIZE-1))/8); +} + +#ifdef CONFIG_X86_LOCAL_APIC +static void lguest_apic_write(unsigned long reg, unsigned int v) +{ +} + +static unsigned int lguest_apic_read(unsigned long reg) +{ + return 0; +} +#endif + +#if 0 +/* We move eflags word to lguest_data.irq_enabled to restore interrupt + state. For page faults, gpfs and virtual interrupts, the + hypervisor has saved eflags manually, otherwise it was delivered + directly and so eflags reflects the real machine IF state, + ie. interrupts on. Since the kernel always dies if it takes such a + trap with interrupts disabled anyway, turning interrupts back on + unconditionally here is OK. */ +asm("lguest_iret:" + " pushq %rax;" + " movq 0x18(%rsp), %rax;" + "lguest_noirq_start:;" + " movq %rax, lguest_data+"__stringify(LGUEST_DATA_irq_enabled)";" + " popq %rax;" + " iretq;" + "lguest_noirq_end:"); +extern char lguest_noirq_start[], lguest_noirq_end[]; +#endif + +extern void lguest_iret(void); +asm("lguest_iret:" + " movq $" __stringify(LHCALL_IRET) ", %rax\n" + " int $" __stringify(LGUEST_TRAP_ENTRY) ); + + +static void lguest_load_rsp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + lazy_hcall(LHCALL_SET_STACK, thread->rsp0, THREAD_SIZE/PAGE_SIZE, 0); +} + +static void lguest_load_tr_desc(void) +{ +} + +static void lguest_set_ldt(const void *addr, unsigned entries) +{ + /* FIXME: Implement. 
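The set_pmd/set_pud/set_pgd hooks above split the entry's physical address into the page holding the table (masked with PTE_MASK) and the index of the 8-byte entry inside that page, so the host can locate the corresponding shadow entry directly. A worked example with a made-up address, for illustration only:

static void example_set_pmd_args(void)
{
	unsigned long pa   = 0x1234568UL;                /* __pa(pmdp)     */
	unsigned long page = pa & PTE_MASK;              /* 0x1234000      */
	unsigned long idx  = (pa & (PAGE_SIZE - 1)) / 8; /* 0x568/8 = 0xad */

	lazy_hcall(LHCALL_SET_PMD, current_cr3, page, idx);
}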
*/ + BUG_ON(entries); +} + +static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) +{ + lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0); +} + +static void lguest_set_debugreg(int regno, unsigned long value) +{ + /* FIXME: Implement */ +} + +static unsigned int lguest_cr0; +static void lguest_clts(void) +{ + lazy_hcall(LHCALL_TS, 0, 0, 0); + lguest_cr0 &= ~8U; +} + +static unsigned long lguest_read_cr0(void) +{ + return lguest_cr0; +} + +static void lguest_write_cr0(unsigned long val) +{ + hcall(LHCALL_TS, val & 8, 0, 0); + lguest_cr0 = val; +} + +static unsigned long lguest_read_cr2(void) +{ + return lguest_data.cr2; +} + +static unsigned long lguest_read_cr3(void) +{ + return current_cr3; +} + +/* Used to enable/disable PGE, but we don't care. */ +static unsigned long lguest_read_cr4(void) +{ + return 0; +} + +static void lguest_write_cr4(unsigned long val) +{ +} + +static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) +{ + do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0)); + update_process_times(user_mode_vm(get_irq_regs())); +} + +static void disable_lguest_irq(unsigned int irq) +{ + set_bit(irq, lguest_data.interrupts); +} + +static void enable_lguest_irq(unsigned int irq) +{ + clear_bit(irq, lguest_data.interrupts); + /* FIXME: If it's pending? */ +} + +static struct irq_chip lguest_irq_controller = { + .name = "lguest", + .mask = disable_lguest_irq, + .mask_ack = disable_lguest_irq, + .unmask = enable_lguest_irq, +}; + +static void lguest_time_init(void) +{ + set_irq_handler(0, lguest_time_irq); + hcall(LHCALL_TIMER_START,HZ,0,0); +} + +static void lguest_ebda_info(unsigned *addr, unsigned *size) +{ + *addr = *size = 0; +} + +/* From i8259.c */ +extern void (*interrupt[])(void); +static void __init lguest_init_IRQ(void) +{ + unsigned int i; + + for (i = 0; i < LGUEST_IRQS; i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + /* FIXTHEM: We should be doing it in a lot of other places */ + if (vector != IA32_SYSCALL_VECTOR) { + printk("Setting vector %x as %p\n",vector, &interrupt[i]); + set_intr_gate(vector, interrupt[i]); + set_irq_chip_and_handler(i, &lguest_irq_controller, + handle_level_irq); + hcall(LHCALL_LOAD_IDT_ENTRY, vector, __pa((u64)&idt_table[vector]), 0); + } + } +} + +static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high) +{ + u32 *lp = (u32 *)((char *)dt + entry*8); + lp[0] = entry_low; + lp[1] = entry_high; +} + +static void lguest_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) +{ + /* FIXME: Allow this. */ + BUG(); +} + +static void lguest_write_gdt_entry(void *dt, int entrynum, + u32 low, u32 high) +{ + native_write_dt_entry(dt, entrynum, low, high); + hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); +} + +static void lguest_write_idt_entry(void *dt, int entrynum, + u32 low, u32 high) +{ + native_write_dt_entry(dt, entrynum, low, high); + hcall(LHCALL_CRASH, 0, 0 ,0); + hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); +} + +#define LGUEST_IRQ "lguest_data+"__stringify(LGUEST_DATA_irq_enabled) +#define DEF_LGUEST(name, code) \ + extern const char start_##name[], end_##name[]; \ + asm("start_" #name ": " code "; end_" #name ":") +DEF_LGUEST(cli, "movl $0," LGUEST_IRQ); +DEF_LGUEST(sti, "movl $512," LGUEST_IRQ); +DEF_LGUEST(popf, "movl %eax," LGUEST_IRQ); +DEF_LGUEST(pushf, "movl " LGUEST_IRQ ",%eax"); +DEF_LGUEST(pushf_cli, "movl " LGUEST_IRQ ",%eax; movl $0," LGUEST_IRQ); +DEF_LGUEST(iret, ".byte 0xE9,0,0,0,0"); /* jmp ... 
*/ + +static const struct lguest_insns +{ + const char *start, *end; +} lguest_insns[] = { + [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli }, + [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti }, + [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf }, + [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf }, + [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli }, + [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret }, +}; +static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len) +{ + unsigned int insn_len; + + /* Don't touch it if we don't have a replacement */ + if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) + return len; + + insn_len = lguest_insns[type].end - lguest_insns[type].start; + + /* Similarly if we can't fit replacement. */ + if (len < insn_len) + return len; + + memcpy(insns, lguest_insns[type].start, insn_len); + if (type == PARAVIRT_INTERRUPT_RETURN) { + /* Jumps are relative. */ + u64 off = (u64)lguest_iret - ((u64)insns + insn_len); + memcpy(insns+1, &off, sizeof(off)); + } + return insn_len; +} + +static void lguest_safe_halt(void) +{ + hcall(LHCALL_HALT, 0, 0, 0); +} + +static unsigned long lguest_get_wallclock(void) +{ + return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0); +} + +static void lguest_power_off(void) +{ + hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); +} + +static void lguest_syscall_init(void) +{ + /* FIXME: Will have to implement it later */ +} + +static __attribute_used__ __init void lguest_init(void) +{ + int i; + + current_cr3 = __pa(&boot_level4_pgt); + paravirt_ops.name = "lguest"; + paravirt_ops.mem_type = "LGUEST"; + paravirt_ops.paravirt_enabled = 1; + paravirt_ops.syscall_init = lguest_syscall_init; + + paravirt_ops.save_fl = save_fl; + paravirt_ops.restore_fl = restore_fl; + paravirt_ops.irq_disable = irq_disable; + paravirt_ops.irq_enable = irq_enable; + paravirt_ops.load_gdt = lguest_load_gdt; + paravirt_ops.memory_setup = lguest_memory_setup; + paravirt_ops.cpuid = lguest_cpuid; + paravirt_ops.write_cr3 = lguest_write_cr3; + paravirt_ops.read_msr = lguest_read_msr, + paravirt_ops.write_msr = lguest_write_msr, + paravirt_ops.read_tsc = lguest_read_tsc, + paravirt_ops.flush_tlb_user = lguest_flush_tlb; + paravirt_ops.flush_tlb_single = lguest_flush_tlb_single; + paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; + paravirt_ops.set_pte = lguest_set_pte; + paravirt_ops.set_pte_at = lguest_set_pte_at; + paravirt_ops.set_pmd = lguest_set_pmd; + paravirt_ops.set_pud = lguest_set_pud; + paravirt_ops.set_pgd = lguest_set_pgd; +#ifdef CONFIG_X86_LOCAL_APIC + paravirt_ops.apic_write = lguest_apic_write; + paravirt_ops.apic_read = lguest_apic_read; +#endif + paravirt_ops.load_idt = lguest_load_idt; + paravirt_ops.iret = lguest_iret; + paravirt_ops.load_rsp0 = lguest_load_rsp0; + paravirt_ops.load_tr_desc = lguest_load_tr_desc; + paravirt_ops.set_ldt = lguest_set_ldt; + paravirt_ops.load_tls = lguest_load_tls; + paravirt_ops.set_debugreg = lguest_set_debugreg; + paravirt_ops.clts = lguest_clts; + paravirt_ops.read_cr0 = lguest_read_cr0; + paravirt_ops.write_cr0 = lguest_write_cr0; + paravirt_ops.init_IRQ = lguest_init_IRQ; + paravirt_ops.read_cr2 = lguest_read_cr2; + paravirt_ops.read_cr3 = lguest_read_cr3; + paravirt_ops.read_cr4 = lguest_read_cr4; + paravirt_ops.write_cr4 = lguest_write_cr4; + paravirt_ops.write_ldt_entry = lguest_write_ldt_entry; + paravirt_ops.write_gdt_entry = lguest_write_gdt_entry; + paravirt_ops.write_idt_entry = lguest_write_idt_entry; + paravirt_ops.patch = lguest_patch; + 
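/* halt, wallclock and the timer tick are forwarded to the host as hypercalls */ +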
paravirt_ops.safe_halt = lguest_safe_halt; + paravirt_ops.get_wallclock = lguest_get_wallclock; + paravirt_ops.time_init = lguest_time_init; +#ifdef PARAVIRT_LAZY_NONE + paravirt_ops.set_lazy_mode = lguest_lazy_mode; +#endif + paravirt_ops.ebda_info = lguest_ebda_info; + + memset(lguest_data.hcall_status,0xFF,sizeof(lguest_data.hcall_status)); +#if 0 + lguest_data.noirq_start = (u64)lguest_noirq_start; + lguest_data.noirq_end = (u64)lguest_noirq_end; +#endif + lguest_data.start_kernel_map = __START_KERNEL_map; /* current page offset */ + lguest_data.page_offset = PAGE_OFFSET; + + code_stack[0].next = __pa(&code_stack[1]); + code_stack[0].start = (unsigned long)_stext; + code_stack[0].end = (unsigned long)_etext; + code_stack[1].next = 0; + code_stack[1].start = (unsigned long)_sinittext; + code_stack[1].end = (unsigned long)_einittext; + + lguest_data.text = __pa(&code_stack[0]); + + lguest_data.kallsyms_addresses = __pa(&kallsyms_addresses); + lguest_data.kallsyms_num_syms = kallsyms_num_syms; + lguest_data.kallsyms_names = __pa(&kallsyms_names); + lguest_data.kallsyms_token_table = __pa(&kallsyms_token_table); + lguest_data.kallsyms_token_index = __pa(&kallsyms_token_index); + lguest_data.kallsyms_markers = __pa(&kallsyms_markers); + + hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0); + + lguest_pops = &local_pops; + lguest_paravirt = 1; + + memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t)); + lguest_write_cr3(__pa_symbol(&init_level4_pgt)); + + for (i = 0; i < NR_CPUS; i++) + cpu_pda(i) = &boot_cpu_pda[i]; + + pda_init(0); +// copy_bootdata(real_mode_data); +#ifdef CONFIG_SMP + cpu_set(0, cpu_online_map); +#endif + +// strncpy(boot_command_line, boot->cmdline, COMMAND_LINE_SIZE); + + /* We use top of mem for initial pagetables. */ +// init_pg_tables_end = __pa(pg0); + +// reserve_top_address(lguest_data.reserve_mem); + + /* FIXME: Better way? */ + /* Suppress vgacon startup code */ + SCREEN_INFO.orig_video_isVGA = VIDEO_TYPE_VLFB; + + add_preferred_console("hvc", 0, NULL); +/* +#ifdef CONFIG_X86_MCE + mcheck_disable(NULL); +#endif +*/ +#ifdef CONFIG_ACPI + acpi_disabled = 1; + acpi_ht = 0; +#endif + if (boot->initrd_size) { + /* We stash this at top of memory. */ + INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size; + INITRD_SIZE = boot->initrd_size; + LOADER_TYPE = 0xFF; + } + pm_power_off = lguest_power_off; + + start_kernel(); +} + +asm("lguest_maybe_init:\n" + " cmpq $"__stringify(LGUEST_MAGIC_R13)", %r13\n" + " jne 1f\n" + " cmpq $"__stringify(LGUEST_MAGIC_R14)", %r14\n" + " jne 1f\n" + " cmpq $"__stringify(LGUEST_MAGIC_R15)", %r15\n" + " je lguest_init\n" + "1: ret"); + +extern void asmlinkage lguest_maybe_init(void); +paravirt_probe(lguest_maybe_init); Index: work-pv/arch/x86_64/lguest/lguest.h =================================================================== --- /dev/null +++ work-pv/arch/x86_64/lguest/lguest.h @@ -0,0 +1,161 @@ +#ifndef _LGUEST_GUEST_H_ +#define _LGUEST_GUEST_H_ + +#define GUEST_DPL 0x3 + +#define gdt_index(x) ((x) >> 3) + +/* + * Must be less than fixmap! + * + * To keep the hypervisor from needing any data sections, + * we need to hard code the difference between what the hypervisor + * may put into the GS base, and what we let the guest put in. + * We allow the guest to put in "Kernel addresses" to simplify + * the guest PDA code. 
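The LGUEST_HV_OFFSET_HIGH/LOW values below appear to encode that hard-coded difference. 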
+ */ +#define LGUEST_HV_OFFSET_HIGH 0xffffffff +#define LGUEST_HV_OFFSET_LOW 0xff000000 + +#define LGUEST_NMI_IST 7 + +#define LGUEST_MAGIC 0x6c6775657374 /* "lguest" */ + +#ifndef __ASSEMBLY__ +#include <asm/lguest.h> + +extern void switch_to_guest(struct lguest_vcpu *); +extern unsigned long hcall_teste; +extern unsigned long host_syscall; +extern unsigned long _lguest_default_idt_entries[]; +extern unsigned long lguest_hv_addr; +extern unsigned long lguest_hv_offset; +extern int lguest_hv_pages; +extern int lguest_vcpu_pages; +extern int lguest_vcpu_order; +extern struct mutex lguest_lock; + +/* FIXME: Those would live better in some main kernel header */ +/* Page fault error code bits */ +#define PF_PROT (1<<0) /* or no page found */ +#define PF_WRITE (1<<1) +#define PF_USER (1<<2) +#define PF_RSVD (1<<3) +#define PF_INSTR (1<<4) + +#define kill_guest(guest, fmt...) \ +do { \ + if (!(guest)->dead) { \ + (guest)->dead = kasprintf(GFP_ATOMIC, fmt); \ + if (!(guest)->dead) \ + (guest)->dead = (void *)-1; \ + } \ +} while (0) + +#define kill_guest_dump(vcpu, fmt...) \ +do { \ + kill_guest((vcpu)->guest, fmt); \ + lguest_dump_vcpu_regs(vcpu); \ +} while(0) + +static inline void _lguest_set_gate(struct gate_struct *s, unsigned type, unsigned long func, + unsigned dpl, unsigned ist) +{ + s->offset_low = PTR_LOW(func); + s->segment = __HV_CS; + s->ist = ist; + s->p = 1; + s->dpl = dpl; + s->zero0 = 0; + s->zero1 = 0; + s->type = type; + s->offset_middle = PTR_MIDDLE(func); + s->offset_high = PTR_HIGH(func); +} + +static inline unsigned long guest_pa(struct lguest_guest_info *linfo, u64 addr) +{ + return (addr >= linfo->start_kernel_map) ? + (addr - linfo->start_kernel_map) : + (addr - linfo->page_offset); +} + +int lguest_address_ok(const struct lguest_guest_info *, u64); + +int demand_page(struct lguest_vcpu *, u64, int); +/* FIXME: put this in hv_vm.h */ +unsigned long hvvm_get_actual_phys(void *addr, pgprot_t *prot); + +int lguest_device_init(void); +void lguest_device_remove(void); + +/* page_tables.h */ +int lguest_map_hv_pages(struct lguest_guest_info *lguest, + unsigned long vaddr, int pages, + pgprot_t *prot); +int lguest_map_guest_page(struct lguest_guest_info *lguest, + unsigned long vaddr, unsigned long paddr, + pgprot_t prot); +void lguest_unmap_guest_pages(struct lguest_guest_info *lguest, + unsigned long vaddr, int pages); +void lguest_free_guest_pages(struct lguest_guest_info *lguest); + +void *lguest_mem_addr(struct lguest_vcpu *vcpu, u64 vaddr); + +void guest_set_pte(struct lguest_vcpu *vcpu, + unsigned long cr3, unsigned long base, + unsigned long idx); +void guest_set_pmd(struct lguest_vcpu *vcpu, + unsigned long cr3, unsigned long base, + unsigned long val); +void guest_set_pud(struct lguest_vcpu *vcpu, + unsigned long cr3, unsigned long base, + unsigned long val); +void guest_set_pgd(struct lguest_vcpu *vcpu, + unsigned long cr3, unsigned long base, + unsigned long val); +void guest_flush_tlb_single(struct lguest_vcpu *vcpu, u64 cr3, u64 vaddr); +void guest_pagetable_clear_all(struct lguest_vcpu *vcpu); +void guest_pagetable_flush_user(struct lguest_vcpu *vcpu); +void guest_new_pagetable(struct lguest_vcpu *vcpu, u64 pgtable); + +int init_guest_pagetable(struct lguest_guest_info *linfo, u64 pgtable); +int lguest_init_vcpu_pagetable(struct lguest_vcpu *vcpu); + +int hypercall(struct lguest_vcpu *vcpu); + +/* core.c */ +u8 lhread_u8(struct lguest_vcpu *vcpu, u64 addr); +u16 lhread_u16(struct lguest_vcpu *vcpu, u64 addr); +u64 lhread_u64(struct lguest_vcpu *vcpu, u64 addr); 
+void lhwrite_u64(struct lguest_vcpu *vcpu, u64 addr, u64 val); + +void lhread(struct lguest_guest_info *, void *, u64, unsigned); +void lhwrite(struct lguest_guest_info *, u64, const void *, unsigned); + +/* io.c */ +u32 bind_dma(struct lguest_guest_info *, unsigned long, unsigned long, + u16, u8); +int send_dma(struct lguest_guest_info *, unsigned long, unsigned long); + +/* interrupts_and_traps.c */ + +void load_guest_idt_entry(struct lguest_vcpu *, unsigned int, + struct gate_struct *); +void maybe_do_interrupt(struct lguest_vcpu *); +void guest_iret(struct lguest_vcpu *vcpu); +int reflect_trap(struct lguest_vcpu *, int, int); + +/* lguest_debug.c */ +extern int lguest_debug; +void lgdebug_print(const char *fmt, ...); +void lgdebug_vprint(const char *fmt, va_list ap); +void lguest_dump_vcpu_regs(struct lguest_vcpu *vcpu); +void lguest_dump_trace(struct lguest_vcpu *vcpu, struct lguest_regs *regs); +void lguest_print_address(struct lguest_vcpu *vcpu, unsigned long address); +void lguest_print_page_tables(u64 *cr3); +void lguest_print_guest_page_tables(struct lguest_vcpu *vcpu, u64 cr3); + +#endif /* !__ASSEMBLY__ */ + +#endif Index: work-pv/arch/x86_64/lguest/lguest_user.c =================================================================== --- /dev/null +++ work-pv/arch/x86_64/lguest/lguest_user.c @@ -0,0 +1,436 @@ +/* Userspace control of the guest, via /dev/lguest. */ +#include <linux/uaccess.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <asm/lguest_user.h> +#include <asm/hv_vm.h> +#include "lguest.h" + +static int next_guest_id; + +#if 0 +/* + addr */ +static long user_get_dma(struct lguest *lg, const u32 __user *input) +{ + unsigned long addr, udma, irq; + + if (get_user(addr, input) != 0) + return -EFAULT; + udma = get_dma_buffer(lg, addr, &irq); + if (!udma) + return -ENOENT; + + /* We put irq number in udma->used_len. 
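Userspace can then read the irq back out of the returned buffer; note this path is still compiled out in the 64-bit port. 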
*/ + lhwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq); + return udma; +} + +/* + irq */ +static int user_send_irq(struct lguest *lg, const u32 __user *input) +{ + u32 irq; + + if (get_user(irq, input) != 0) + return -EFAULT; + if (irq >= LGUEST_IRQS) + return -EINVAL; + set_bit(irq, lg->irqs_pending); + return 0; +} +#endif + +static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) +{ + struct lguest_vcpu *vcpu = file->private_data; + struct lguest_guest_info *linfo = vcpu->guest; + int ret; + + if (!vcpu) + return -EINVAL; + + if (linfo->dead) { + size_t len; + + if (linfo->dead == (void *)-1) + return -ENOMEM; + + len = min(size, strlen(linfo->dead)+1); + if (copy_to_user(user, linfo->dead, len) != 0) + return -EFAULT; + return len; + } + +#if 0 + if (lg->dma_is_pending) + lg->dma_is_pending = 0; +#endif + + ret = run_guest(vcpu, user); + if (ret != -EINTR) + ret = -ENOENT; + return ret; +} + +struct lguest_vcpu *allocate_vcpu(struct lguest_guest_info *linfo) +{ + struct lguest_vcpu *vcpu; + unsigned long hv_vcpu; + int ret; + + vcpu = (void*)__get_free_pages(GFP_KERNEL, lguest_vcpu_order); + if (!vcpu) + return NULL; + memset(vcpu, 0, sizeof(*vcpu)); + + ret = hvvm_map_pages(vcpu, lguest_vcpu_pages, &hv_vcpu); + if (ret < 0) + goto out; + + ret = lguest_map_hv_pages(linfo, hv_vcpu, lguest_vcpu_pages, NULL); + if (ret < 0) + goto out2; + + vcpu->host_page = (unsigned long)vcpu; + + return (struct lguest_vcpu*)hv_vcpu; + +out2: + hvvm_unmap_pages(hv_vcpu, lguest_vcpu_pages); +out: + free_pages((unsigned long)vcpu, lguest_vcpu_order); + + return NULL; +} + +void free_vcpu(struct lguest_guest_info *linfo, struct lguest_vcpu *vcpu) +{ + unsigned long hv_vcpu = (unsigned long)vcpu; + free_pages(vcpu->host_page, lguest_vcpu_order); + lguest_unmap_guest_pages(linfo, hv_vcpu, lguest_vcpu_pages); + hvvm_unmap_pages(hv_vcpu, lguest_vcpu_pages); + lguest_free_guest_pages(linfo); +} + +#if 0 +static void print_tss(struct ldttss_desc *tss) +{ + u64 base; + u64 limit; + int i; + u16 iobp = 0x64; + + base = (tss->base0) + ((u64)tss->base1 << 16) + + ((u64)tss->base2 << 24) + ((u64)tss->base3 << 32); + limit = (tss->limit0) + ((u64)tss->limit1 << 16); + if (tss->g) + limit <<= 12; + printk(" base: %016llx\n", base); + printk(" limit: %llx\n", limit); + printk(" type: %x\n", tss->type); + printk(" dpl: %d\n", tss->dpl); + printk(" p: %d\n", tss->p); + printk(" g: %d\n", tss->g); + + for (i=0; i < limit; i += 4) { + printk(" %8x: %08x\n", i, *(u32*)(base+i)); + if (i == 0x64) { + iobp = (u16)((*(u32*)(base+i))>>16); + } + if (i >= iobp && *(s32*)(base+i) == -1L) + break; + } +} +#endif + +/* should be in some other file ? 
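vcpu_start() allocates and maps a vcpu, copies the host GDT with guest privilege levels, builds the guest IDT and TSS, and fills in the initial register state. 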
*/ +int vcpu_start(int cpu, struct lguest_guest_info *linfo, + unsigned long entry_point, + void *pgd) +{ + struct lguest_vcpu *vcpu; + struct desc_struct *gdt_table; + struct lguest_regs *regs; + struct ldttss_desc *tss; + struct lguest_tss_struct *tss_ptr; + u64 target; + u64 limit; + u64 base; + int i; + + if (cpu > LGUEST_MAX_VCPUS) + return -EINVAL; + + vcpu = allocate_vcpu(linfo); + if (!vcpu) + return -ENOMEM; + + printk("vcpu: %p\n", vcpu); + + /* + * Point back to itself to make it easier to read from gs:base in + * hypervisor.S + */ + vcpu->vcpu = vcpu; + vcpu->magic = LGUEST_MAGIC; + gdt_table = cpu_gdt(get_cpu()); + put_cpu(); + + /* Our gdt is basically host's, except for the privilege level */ + for (i = 0; i < GDT_ENTRIES; i++) { + vcpu->gdt_table[i] = gdt_table[i]; + + if (!gdt_table[i].type) + continue; + + switch (i) { + /* Keep TSS, and HV, and Host KERNEL segments the same */ + case GDT_ENTRY_TSS: + /* The TSS will be modified below */ + case GDT_ENTRY_HV_CS: + case GDT_ENTRY_HV_DS: + case __KERNEL_CS >> 3: + case __KERNEL_DS >> 3: + break; + default: + vcpu->gdt_table[i].dpl = GUEST_DPL; + } + } + + for (i = 0; i < IDT_ENTRIES; i++) { + unsigned dpl = i == LGUEST_TRAP_ENTRY ? GUEST_DPL : 0; + /* NMI gets its own stack */ + int ist = (i == 2) ? LGUEST_NMI_IST : + /* temp debug for now */ + (i == 8) ? 6 : /* Double Fault */ +// (i == 13) ? 5 : /* GPF */ + 0; + + _lguest_set_gate(&vcpu->idt_table[i], 0xe, + _lguest_default_idt_entries[i] + + lguest_hv_offset, dpl, ist); + } + + vcpu->gdt.size = 8 * GDT_ENTRIES - 1; + vcpu->gdt.address = (unsigned long)&vcpu->gdt_table; + + vcpu->idt.size = 16 * IDT_ENTRIES -1; + vcpu->idt.address = (unsigned long)vcpu->idt_table; + rdmsrl(MSR_LSTAR, vcpu->host_syscall); + + vcpu->id = cpu; + vcpu->guest = linfo; + linfo->vcpu[cpu] = vcpu; + + lguest_init_vcpu_pagetable(vcpu); + + /* setup the tss */ + tss = (struct ldttss_desc*)&vcpu->gdt_table[GDT_ENTRY_TSS]; + limit = sizeof(struct lguest_tss_struct); + base = (u64)&vcpu->tss; + tss->limit0 = (u16)limit; + tss->base0 = (u16)base; + tss->base1 = (u8)(base>>16); + tss->base2 = (u8)(base>>24); + tss->base3 = (u32)(base>>32); + tss->type = 0x9; + tss->g = 0; /* small tss */ + + vcpu->tss.rsp0 = (unsigned long)(&vcpu->regs.size); + + /* NMI can happen at any time, so give it its own stack */ + vcpu->tss.ist[LGUEST_NMI_IST-1] = (unsigned long)(&vcpu->nmi_stack_end); + printk("nmi stack at: %llx\n", vcpu->tss.ist[LGUEST_NMI_IST-1]); + + /* temp debug stuff */ + vcpu->tss.ist[5-1] = (unsigned long)(&vcpu->gpf_stack_end); + vcpu->tss.ist[6-1] = (unsigned long)(&vcpu->df_stack_end); + /* + * Load the host nmi stack into the guest tss. This prevents races + * in loading the TR and IDT. + */ + tss = (struct ldttss_desc *)&gdt_table[GDT_ENTRY_TSS]; + target = (u64)tss->base0 | + ((u64)tss->base1 << 16) | + ((u64)tss->base2 << 24) | + ((u64)tss->base3 << 32); + + tss_ptr = (struct lguest_tss_struct*)target; + + vcpu->tss.ist[NMI_STACK-1] = tss_ptr->ist[NMI_STACK-1]; + + /* + * The rsp0 had better be on 16 bytes aligned, or the interrupt + * will put the stack at a undesireable location. + */ + /* Don't remove this test!!! */ + if (unlikely(vcpu->tss.rsp0 & 0xf)) { + printk("HV ALIGNMENT BUG! 
don't put stack here!!\n"); + printk(" tss.rsp0 stack was set to %llx\n", + vcpu->tss.rsp0); + goto out; + } + + vcpu->tss.io_bitmap_base = 0x68; + vcpu->tss.io_bitmap[0] = -1UL; + + regs = &vcpu->regs; + regs->cr3 = __pa(vcpu->pgdir->pgdir); + regs->rax = regs->rbx = regs->rcx = regs->rdx = + regs->r8 = regs->r9 = regs->r10 = regs->r11 = + regs->r12 = regs->rdi = regs->rsi = regs->rbp = 0; + regs->r13 = LGUEST_MAGIC_R13; + regs->r14 = LGUEST_MAGIC_R14; + regs->r15 = LGUEST_MAGIC_R15; + regs->fs = 0; + regs->trapnum = 0; + regs->errcode = 0; + regs->rip = entry_point; +// regs->rip = 0x1000100; + regs->cs = __USER_CS; + regs->rflags = 0x202; /* Interrupts enabled. */ + regs->rsp = 0; + regs->ss = __USER_DS; + + return 0; +out: + free_vcpu(linfo, vcpu); + return -EINVAL; +} + +static int initialize_guest(struct file *file, const u64 __user *input) +{ + struct lguest_guest_info *linfo; + int err; + u64 args[4]; + int i; + + if (file->private_data) + return -EBUSY; + + if (copy_from_user(args, input, sizeof(args)) != 0) + return -EFAULT; + + linfo = kzalloc(sizeof(*linfo), GFP_KERNEL); + if (!linfo) + return -ENOMEM; + + mutex_init(&linfo->page_lock); + + /* FIXME: protect the guest_id counter */ + linfo->guest_id = ++next_guest_id; + + linfo->pfn_limit = args[0]; + linfo->page_offset = args[3]; + linfo->start_kernel_map = args[3]; + + mutex_init(&linfo->page_lock); + INIT_LIST_HEAD(&linfo->pgd_list); + + for (i=0; i < PUD_HASH_SIZE; i++) + INIT_LIST_HEAD(&linfo->pud_hash[i]); + + for (i=0; i < PMD_HASH_SIZE; i++) + INIT_LIST_HEAD(&linfo->pmd_hash[i]); + + for (i=0; i < PTE_HASH_SIZE; i++) + INIT_LIST_HEAD(&linfo->pte_hash[i]); + + err = init_guest_pagetable(linfo, args[1]); + if (err) + return -ENOMEM; /* what else to return ?? */ +#if 0 + + lg->state = setup_guest_state(i, lg->pgdirs[lg->pgdidx].pgdir,args[2]); + if (!lg->state) { + err = -ENOEXEC; + goto release_pgtable; + } +#endif + err = vcpu_start(0, linfo, args[2], __va(read_cr3())); + if (err < 0) + return err; + + file->private_data = linfo->vcpu[0]; + + return sizeof(args); +} + +static ssize_t write(struct file *file, const char __user *input, + size_t size, loff_t *off) +{ + struct lguest_vcpu *vcpu = file->private_data; + u64 req; + + if (get_user(req, input) != 0) + return -EFAULT; + input += sizeof(req); + + if (req != LHREQ_INITIALIZE && !vcpu) + return -EINVAL; +#if 0 + if (lg && lg->dead) + return -ENOENT; +#endif + + switch (req) { + case LHREQ_INITIALIZE: + return initialize_guest(file, (const u64 __user *)input); +#if 0 + case LHREQ_GETDMA: + return user_get_dma(lg, (const u32 __user *)input); + case LHREQ_IRQ: + return user_send_irq(lg, (const u32 __user *)input); +#endif + default: + return -EINVAL; + } +} + +static int close(struct inode *inode, struct file *file) +{ + struct lguest_vcpu *vcpu = file->private_data; + struct lguest_guest_info *linfo; + + if (!vcpu) + return -EBADFD; + + linfo = vcpu->guest; + /* FIXME: need to handle multiple vcpus */ + free_vcpu(linfo, vcpu); + kfree(linfo); +#if 0 + mutex_lock(&lguest_lock); + release_all_dma(lg); + free_page((long)lg->trap_page); + free_guest_pagetable(lg); + mmput(lg->mm); + if (lg->dead != (void *)1) + kfree(lg->dead); + memset(lg->state, 0, sizeof(*lg->state)); + memset(lg, 0, sizeof(*lg)); + mutex_unlock(&lguest_lock); +#endif + return 0; +} + +static struct file_operations lguest_fops = { + .owner = THIS_MODULE, + .release = close, + .write = write, + .read = read, +}; +static struct miscdevice lguest_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "lguest", + 
.fops = &lguest_fops, +}; + +int __init lguest_device_init(void) +{ + return misc_register(&lguest_dev); +} + +void __exit lguest_device_remove(void) +{ + misc_deregister(&lguest_dev); +} Index: work-pv/arch/x86_64/lguest/page_tables.c =================================================================== --- /dev/null +++ work-pv/arch/x86_64/lguest/page_tables.c @@ -0,0 +1,1285 @@ +/* Shadow page table operations. + * Copyright (C) Steven Rostedt, Red Hat Inc, 2007 + * GPL v2 and any later version */ +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/random.h> +#include <linux/percpu.h> +#include <asm/tlbflush.h> +#include <asm/hv_vm.h> +#include "lguest.h" + +/* move this to hv_vm.h */ +#define HVVM_END (HVVM_START + HV_VIRT_SIZE) + +#define HASH_PUD(x) (((u64)(x)>>PAGE_SHIFT) & (PUD_HASH_SIZE-1)) +#define HASH_PMD(x) (((u64)(x)>>PAGE_SHIFT) & (PMD_HASH_SIZE-1)) +#define HASH_PTE(x) (((u64)(x)>>PAGE_SHIFT) & (PTE_HASH_SIZE-1)) + +/* guest and host share the same offset into the page tables */ +/* 9 bits at 8 byte increments */ +#define guest_host_idx(vaddr) ((vaddr) & (0x1ff<<3)) + + +/* These access the guest versions. */ +static u64 gtoplev(struct lguest_vcpu *vcpu, unsigned long vaddr) +{ + unsigned index = pgd_index(vaddr); + + return vcpu->pgdir->cr3 + index * sizeof(u64); +} + + +#if 0 + +/* FIXME: we need to put these in and make it more secure! */ +static u32 check_pgtable_entry(struct lguest *lg, u32 entry) +{ + if ((entry & (_PAGE_PWT|_PAGE_PSE)) + || (entry >> PAGE_SHIFT) >= lg->pfn_limit) + kill_guest(lg, "bad page table entry"); + return entry & ~_PAGE_GLOBAL; +} + +void pin_stack_pages(struct lguest *lg) +{ + unsigned int i; + u32 stack = lg->state->tss.esp1; + + for (i = 0; i < lg->stack_pages; i++) + if (!demand_page(lg, stack - i*PAGE_SIZE, 1)) + kill_guest(lg, "bad stack page %i@%#x", i, stack); +} + +void free_guest_pagetable(struct lguest *lg) +{ + unsigned int i; + + release_all_pagetables(lg); + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + free_page((long)lg->pgdirs[i].pgdir); +} + +/* Caller must be preempt-safe */ +void map_trap_page(struct lguest *lg) +{ + int cpu = smp_processor_id(); + + hypervisor_pte_page(cpu)[0] = (__pa(lg->trap_page)|_PAGE_PRESENT); + + /* Since hypervisor less that 4MB, we simply mug top pte page. */ + lg->pgdirs[lg->pgdidx].pgdir[HYPERVISOR_PGD_ENTRY] = + (__pa(hypervisor_pte_page(cpu))| __PAGE_KERNEL); +} + +#endif + +static int __lguest_map_guest_page(struct lguest_guest_info *linfo, u64 *cr3, + unsigned long vaddr, unsigned long paddr, + pgprot_t pprot); + +/* Do a virtual -> physical mapping on a user page. 
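Returns the host pfn backing the page, or -1UL on failure; get_user_pages() leaves the page pinned until release_pte() drops the reference. 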
*/ +static unsigned long get_pfn(unsigned long virtpfn, int write) +{ + struct vm_area_struct *vma; + struct page *page; + unsigned long ret = -1UL; + + down_read(¤t->mm->mmap_sem); + if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT, + 1, write, 1, &page, &vma) == 1) + ret = page_to_pfn(page); + up_read(¤t->mm->mmap_sem); + return ret; +} + +static int is_hv_page(int pgd_idx, int pud_idx, int pmd_idx, int pte_idx) +{ + /* Never release the hv pages */ + u64 addr = (u64)pgd_idx << PGDIR_SHIFT | + (u64)pud_idx << PUD_SHIFT | + (u64)pmd_idx << PMD_SHIFT | + (u64)pte_idx << PAGE_SHIFT; + /* sign extend */ + if (pgd_idx & (1<<8)) + addr |= 0xffffULL << 48; + return (addr >= HVVM_START) && + (addr < (HVVM_START + HV_VIRT_SIZE)); +} + +static void release_pte(u64 pte) +{ + if (pte & _PAGE_PRESENT) + put_page(pfn_to_page(pte >> PAGE_SHIFT)); +} + +static int release_pmd(int pgd_idx, int pud_idx, u64 *pmd, int idx) +{ + int save = 0; + if (pmd[idx] & _PAGE_PRESENT) { + int i; + u64 *ptepage = __va(pmd[idx] & PTE_MASK); + for (i=0; i < PTRS_PER_PMD; i++) + if (is_hv_page(pgd_idx, pud_idx, idx, i)) + save = 1; + else + release_pte(ptepage[i]); + /* never free the HV pmds */ + if (!save) { + free_page((unsigned long)ptepage); + pmd[idx] = 0; + } + } + return save; +} + +static int release_pud(int pgd_idx, u64 *pud, int idx) +{ + int save = 0; + if (pud[idx] & _PAGE_PRESENT) { + int i; + u64 *pmdpage = __va(pud[idx] & PTE_MASK); + for (i=0; i < PTRS_PER_PUD; i++) + if (release_pmd(pgd_idx, idx, pmdpage, i)) + save = 1; + /* never free the HV puds */ + if (!save) { + free_page((unsigned long)pmdpage); + pud[idx] = 0; + } + } + return save; +} + +static int release_pgd(u64 *pgd, int idx) +{ + int save = 0; + + if (pgd[idx] & _PAGE_PRESENT) { + int i; + u64 *pudpage = __va(pgd[idx] & PTE_MASK); + for (i=0; i < PTRS_PER_PGD; i++) { + if (release_pud(idx, pudpage, i)) + save = 1; + } + /* never free the HV pgd */ + if (!save) { + free_page((unsigned long)pudpage); + pgd[idx] = 0; + } + } + return save; +} + +static struct lguest_pgd *find_pgd(struct lguest_guest_info *linfo, u64 cr3) +{ + struct lguest_pgd *pgdir; + + list_for_each_entry(pgdir, &linfo->pgd_list, list) + if (!(pgdir->flags & LGUEST_PGD_MASTER_FL) && pgdir->cr3 == cr3) + break; + + if (pgdir == list_entry(&linfo->pgd_list, struct lguest_pgd, list)) + return NULL; + + return pgdir; +} + +static struct lguest_pud *find_pud(struct lguest_guest_info *linfo, u64 gpud) +{ + unsigned idx = HASH_PUD(gpud); + struct lguest_pud *pudir; + + list_for_each_entry(pudir, &linfo->pud_hash[idx], list) + if (pudir->gpud == gpud) + break; + + if (pudir == list_entry(&linfo->pud_hash[idx], struct lguest_pud, list)) + return NULL; + + return pudir; +} + +static struct lguest_pmd *find_pmd(struct lguest_guest_info *linfo, u64 gpmd) +{ + unsigned idx = HASH_PMD(gpmd); + struct lguest_pmd *pmdir; + + list_for_each_entry(pmdir, &linfo->pmd_hash[idx], list) + if (pmdir->gpmd == gpmd) + break; + + if (pmdir == list_entry(&linfo->pmd_hash[idx], struct lguest_pmd, list)) + return NULL; + + return pmdir; +} + +static struct lguest_pte *find_pte(struct lguest_guest_info *linfo, u64 gpte) +{ + unsigned idx = HASH_PTE(gpte); + struct lguest_pte *pte; + + list_for_each_entry(pte, &linfo->pte_hash[idx], list) + if (pte->gpte == gpte) + break; + + if (pte == list_entry(&linfo->pte_hash[idx], struct lguest_pte, list)) + return NULL; + + return pte; +} + +static void __release_pte_hash(struct lguest_vcpu *vcpu, struct lguest_pte *pte) +{ + list_del(&pte->list); + 
kfree(pte); +} + +static void __release_pmd_hash(struct lguest_vcpu *vcpu, struct lguest_pmd *pmdir) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct lguest_pte *pte; + int i; + + list_del(&pmdir->list); + + for (i=0; i < PTRS_PER_PMD; i++) { + u64 gpte; + + gpte = lhread_u64(vcpu, pmdir->gpmd+i*sizeof(u64)); + if (!gpte) + continue; + pte = find_pte(linfo, gpte & PTE_MASK); + if (!pte) + continue; + __release_pte_hash(vcpu, pte); + } + + kfree(pmdir); +} + +static void __release_pud_hash(struct lguest_vcpu *vcpu, struct lguest_pud *pudir) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct lguest_pmd *pmdir; + int i; + + list_del(&pudir->list); + + for (i=0; i < PTRS_PER_PUD; i++) { + u64 gpmd; + + gpmd = lhread_u64(vcpu, pudir->gpud+i*sizeof(u64)); + if (!gpmd) + continue; + pmdir = find_pmd(linfo, gpmd & PTE_MASK); + if (!pmdir) + continue; + __release_pmd_hash(vcpu, pmdir); + } + + kfree(pudir); +} + +static struct lguest_pud *hash_pud(struct lguest_vcpu *vcpu, u64 gpud, unsigned idx) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct lguest_pud *pudir; + unsigned h; + + mutex_lock(&linfo->page_lock); + pudir = find_pud(linfo, gpud); + if (!pudir) { + /* FIXME: make this a slab? */ + pudir = kzalloc(sizeof(*pudir), GFP_KERNEL); + if (!pudir) + goto out; + h = HASH_PUD(gpud); + list_add(&pudir->list, &linfo->pud_hash[h]); + pudir->pgdir = vcpu->pgdir; + pudir->gpud = gpud; + pudir->idx = idx; + } +out: + mutex_unlock(&linfo->page_lock); + + return pudir; +} + +static struct lguest_pmd *hash_pmd(struct lguest_vcpu *vcpu, struct lguest_pud *pudir, + u64 gpmd, unsigned idx) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct lguest_pmd *pmdir; + unsigned h; + + mutex_lock(&linfo->page_lock); + pmdir = find_pmd(linfo, gpmd); + if (!pmdir) { + /* FIXME: make this a slab? */ + pmdir = kzalloc(sizeof(*pmdir), GFP_KERNEL); + if (!pmdir) + goto out; + h = HASH_PMD(gpmd); + list_add(&pmdir->list, &linfo->pmd_hash[h]); + pmdir->pudir = pudir; + pmdir->gpmd = gpmd; + pmdir->idx = idx; + } +out: + mutex_unlock(&linfo->page_lock); + + return pmdir; +} + +static struct lguest_pte *hash_pte(struct lguest_vcpu *vcpu, struct lguest_pmd *pmdir, + u64 gpte, unsigned idx) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct lguest_pte *pte; + unsigned h; + + mutex_lock(&linfo->page_lock); + pte = find_pte(linfo, gpte); + if (!pte) { + /* FIXME: make this a slab? 
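A dedicated slab cache would likely be cheaper than kzalloc() for these small, frequently allocated hash entries. 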
*/ + pte = kzalloc(sizeof(*pte), GFP_KERNEL); + if (!pte) + goto out; + h = HASH_PTE(gpte); + list_add(&pte->list, &linfo->pte_hash[h]); + pte->pmdir = pmdir; + pte->gpte = gpte; + pte->idx = idx; + } +out: + mutex_unlock(&linfo->page_lock); + + return pte; +} + +void guest_set_pte(struct lguest_vcpu *vcpu, + unsigned long cr3, unsigned long vaddr, + unsigned long value) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct lguest_pud *pudir; + struct lguest_pmd *pmdir; + struct lguest_pte *ptedir; + unsigned long idx = (vaddr & (PAGE_SIZE-1)) / 8; + u64 base = vaddr & PTE_MASK; + u64 pgd; + u64 pud; + u64 pmd; + u64 pte; + u64 *pudpage; + u64 *pmdpage; + u64 *ptepage; + + mutex_lock(&linfo->page_lock); + + ptedir = find_pte(linfo, base); + if (!ptedir) + goto out; + + pmdir = ptedir->pmdir; + pudir = pmdir->pudir; + + pgd = vcpu->pgdir->pgdir[pudir->idx]; + if (!(pgd & _PAGE_PRESENT)) + goto out; + + pudpage = __va(pgd & PTE_MASK); + pud = pudpage[pmdir->idx]; + + if (!(pud & _PAGE_PRESENT)) + goto out; + + pmdpage = __va(pud & PTE_MASK); + pmd = pmdpage[ptedir->idx]; + + if (!(pmd & _PAGE_PRESENT)) + goto out; + + ptepage = __va(pmd & PTE_MASK); + pte = ptepage[idx]; + + if (!(pte & _PAGE_PRESENT)) + goto out; + + /* If the guest is trying to touch HV area, kill it! */ + if (is_hv_page(pudir->idx, pmdir->idx, ptedir->idx, idx)) { + kill_guest_dump(vcpu, "guest trying to write to HV area\n"); + goto out; + } + + /* FIXME: perhaps we could set the pte now ? */ + + release_pte(ptepage[idx]); + __release_pte_hash(vcpu, ptedir); + +out: + mutex_unlock(&linfo->page_lock); +} + +void guest_set_pmd(struct lguest_vcpu *vcpu, + unsigned long cr3, unsigned long base, + unsigned long idx) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct lguest_pud *pudir; + struct lguest_pmd *pmdir; + u64 pgd; + u64 pud; + u64 pmd; + u64 *pudpage; + u64 *pmdpage; + int save; + + if (idx >= PTRS_PER_PMD) { + kill_guest_dump(vcpu, "illegal index for pgd (%ld)\n", idx); + return; + } + + mutex_lock(&linfo->page_lock); + + pmdir = find_pmd(linfo, base); + if (!pmdir) + goto out; + + pudir = pmdir->pudir; + + pgd = vcpu->pgdir->pgdir[pudir->idx]; + if (!(pgd & _PAGE_PRESENT)) + goto out; + + pudpage = __va(pgd & PTE_MASK); + pud = pudpage[pmdir->idx]; + + if (!(pud & _PAGE_PRESENT)) + goto out; + + pmdpage = __va(pud & PTE_MASK); + pmd = pmdpage[idx]; + + if (!(pmd & _PAGE_PRESENT)) + goto out; + + save = release_pmd(pudir->idx, pmdir->idx, pmdpage, idx); + if (!save) + __release_pmd_hash(vcpu, pmdir); + +out: + mutex_unlock(&linfo->page_lock); +} + +void guest_set_pud(struct lguest_vcpu *vcpu, + unsigned long cr3, unsigned long base, + unsigned long idx) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct lguest_pud *pudir; + u64 pgd; + u64 pud; + u64 *pudpage; + int save; + + if (idx >= PTRS_PER_PUD) { + kill_guest_dump(vcpu, "illegal index for pgd (%ld)\n", idx); + return; + } + + mutex_lock(&linfo->page_lock); + + pudir = find_pud(linfo, base); + if (!pudir) + goto out; + + pgd = vcpu->pgdir->pgdir[pudir->idx]; + if (!(pgd & _PAGE_PRESENT)) + goto out; + + pudpage = __va(pgd & PTE_MASK); + pud = pudpage[idx]; + + if (!(pud & _PAGE_PRESENT)) + goto out; + + save = release_pud(pudir->idx, pudpage, idx); + if (!save) + __release_pud_hash(vcpu, pudir); + +out: + mutex_unlock(&linfo->page_lock); +} + +void guest_set_pgd(struct lguest_vcpu *vcpu, unsigned long cr3, + unsigned long base, unsigned long idx) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct lguest_pgd *pgdir; + struct 
lguest_pud *pudir; + u64 gpud; + u64 pgd; + u64 pud; + int save; + + pgdir = vcpu->pgdir; + + if (idx >= PTRS_PER_PGD) { + kill_guest_dump(vcpu, "illegal index for pgd (%ld)\n", idx); + return; + } + + mutex_lock(&linfo->page_lock); + + pgd = pgdir->pgdir[idx]; + if (!(pgd & _PAGE_PRESENT)) + goto out; + + pud = pgd & PTE_MASK; + + gpud = lhread_u64(vcpu, base + idx * sizeof(u64)); + pudir = find_pud(linfo, gpud & PTE_MASK); + if (pudir) + __release_pud_hash(vcpu, pudir); + save = release_pgd(pgdir->pgdir, idx); + + if (!save && idx >= guest_host_idx(linfo->page_offset >> (PGDIR_SHIFT-3))) { + /* All guest procesess share the same kernel PML4Es */ + /* + * So we only free the tree once, but then reset + * all the others. + */ + list_for_each_entry(pgdir, &linfo->pgd_list, list) { + pgd = pgdir->pgdir[idx]; + if (!(pgd & _PAGE_PRESENT)) + continue; + BUG_ON((pgd & PTE_MASK) != pud); + pgdir->pgdir[idx] = 0; + } + } +out: + mutex_unlock(&linfo->page_lock); +} + +void guest_flush_tlb_single(struct lguest_vcpu *vcpu, u64 cr3, u64 vaddr) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct lguest_pgd *pgdir; + unsigned long pgd_idx; + unsigned long pud_idx; + unsigned long pmd_idx; + unsigned long idx; + u64 pgd; + u64 pud; + u64 pmd; + u64 pte; + u64 *pudpage; + u64 *pmdpage; + u64 *ptepage; + + mutex_lock(&linfo->page_lock); + + if (vaddr > linfo->page_offset) + pgdir = &linfo->kpgdir; + else + pgdir = find_pgd(linfo, cr3); + + pgd_idx = pgd_index(vaddr); + pgd = pgdir->pgdir[pgd_idx]; + if (!(pgd & _PAGE_PRESENT)) + goto out; + + pud_idx = pud_index(vaddr); + pudpage = __va(pgd & PTE_MASK); + pud = pudpage[pud_idx]; + + if (!(pud & _PAGE_PRESENT)) + goto out; + + pmd_idx = pmd_index(vaddr); + pmdpage = __va(pud & PTE_MASK); + pmd = pmdpage[pmd_idx]; + + if (!(pmd & _PAGE_PRESENT)) + goto out; + + idx = pte_index(vaddr); + ptepage = __va(pmd & PTE_MASK); + pte = ptepage[idx]; + + if (!(pte & _PAGE_PRESENT)) + goto out; + + /* If the guest is trying to touch HV area, kill it! */ + if (is_hv_page(pgd_idx, pud_idx, pmd_idx, idx)) { + kill_guest_dump(vcpu, "guest trying to write to HV area\n"); + goto out; + } + + release_pte(ptepage[idx]); + /* FIXME: what about the hash?? */ + +out: + mutex_unlock(&linfo->page_lock); +} + +static void flush_user_mappings(struct lguest_guest_info *linfo, struct lguest_pgd *pgdir) +{ + unsigned int i; + for (i = 0; i < pgd_index(linfo->page_offset); i++) + release_pgd(pgdir->pgdir, i); +} + +static struct lguest_pgd *new_pgdir(struct lguest_guest_info *linfo, u64 cr3) +{ + unsigned int next; + unsigned int i; + + next = random32() % LGUEST_PGDIRS; + for (i=(next+1) % LGUEST_PGDIRS; i != next; i = (i+1) % LGUEST_PGDIRS) { + if (linfo->pgdirs[i].flags & LGUEST_PGD_BUSY_FL) + continue; + break; + } + BUG_ON(linfo->pgdirs[i].flags & LGUEST_PGD_BUSY_FL); + + next = i; + + linfo->pgdirs[next].cr3 = cr3; + if (!linfo->pgdirs[next].pgdir) { + linfo->pgdirs[next].pgdir = (u64 *)get_zeroed_page(GFP_KERNEL); + if (!linfo->pgdirs[next].pgdir) + return NULL; + /* all kernel pages are the same */ + for (i=pgd_index(linfo->page_offset); i < PTRS_PER_PGD; i++) + linfo->pgdirs[next].pgdir[i] = linfo->kpgdir.pgdir[i]; + } else { + BUG_ON(!(linfo->pgdirs[next].flags & LGUEST_PGD_LINK_FL)); + /* Release all the non-kernel mappings. 
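That way the recycled shadow pgdir keeps only the shared kernel entries. 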
*/ + flush_user_mappings(linfo, &linfo->pgdirs[next]); + } + + return &linfo->pgdirs[next]; +} + +void guest_new_pagetable(struct lguest_vcpu *vcpu, u64 pgtable) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct lguest_pgd *newpgdir; + + mutex_lock(&linfo->page_lock); + newpgdir = find_pgd(linfo, pgtable); + if (vcpu->pgdir) { + if (!(--vcpu->pgdir->count)) + vcpu->pgdir->flags &= ~(LGUEST_PGD_BUSY_FL); + } + if (!newpgdir) + newpgdir = new_pgdir(linfo, pgtable); + if (!newpgdir) { + kill_guest_dump(vcpu, "no more pgd's available!\n"); + goto out; + } + vcpu->pgdir = newpgdir; + if (!vcpu->pgdir->count++) + vcpu->pgdir->flags |= LGUEST_PGD_BUSY_FL; + vcpu->regs.cr3 = __pa(vcpu->pgdir->pgdir); + if (!(vcpu->pgdir->flags & LGUEST_PGD_LINK_FL)) { + list_add(&vcpu->pgdir->list, &linfo->pgd_list); + vcpu->pgdir->flags |= LGUEST_PGD_LINK_FL; + } +// pin_stack_pages(lg); +out: + mutex_unlock(&linfo->page_lock); +} + +static void release_all_pagetables(struct lguest_guest_info *linfo) +{ + struct lguest_pgd *pgdir, *next; + int i; + + /* We share the kernel pages, so do them once */ + for (i=0; i < PTRS_PER_PGD; i++) + release_pgd(linfo->kpgdir.pgdir, i); + + list_for_each_entry(pgdir, &linfo->pgd_list, list) { + if (pgdir->pgdir) + for (i=0; i < pgd_index(linfo->page_offset); i++) + release_pgd(pgdir->pgdir, i); + } + /* now release any pgdirs that are not busy */ + list_for_each_entry_safe(pgdir, next, &linfo->pgd_list, list) { + if (!(pgdir->flags & LGUEST_PGD_BUSY_FL)) { + BUG_ON(pgdir->count); + pgdir->flags &= ~LGUEST_PGD_LINK_FL; + list_del(&pgdir->list); + free_page((u64)pgdir->pgdir); + pgdir->cr3 = 0; + pgdir->pgdir = NULL; + } + } +} + +void guest_pagetable_clear_all(struct lguest_vcpu *vcpu) +{ + struct lguest_guest_info *linfo = vcpu->guest; + + mutex_lock(&linfo->page_lock); + release_all_pagetables(linfo); +// pin_stack_pages(lg); + mutex_unlock(&linfo->page_lock); +} + +void guest_pagetable_flush_user(struct lguest_vcpu *vcpu) +{ + struct lguest_guest_info *linfo = vcpu->guest; + unsigned int i; + + for (i = 0; i < pgd_index(linfo->page_offset); i++) + release_pgd(vcpu->pgdir->pgdir, i); +} + +/* FIXME: We hold reference to pages, which prevents them from being + swapped. It'd be nice to have a callback when Linux wants to swap out. */ + +/* We fault pages in, which allows us to update accessed/dirty bits. 
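+ * The shadow entry is built by walking the guest's own page tables and pinning the backing host page with get_pfn().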
+ * Return 0 if failed, 1 if good */ +static int page_in(struct lguest_vcpu *vcpu, u64 vaddr, pgprot_t prot) +{ + struct lguest_guest_info *linfo = vcpu->guest; + struct lguest_pud *pudir; + struct lguest_pmd *pmdir; + struct lguest_pte *ptedir; + u64 val; + u64 paddr; + u64 gpgd, gpud, gpmd, gpte; + u64 flags = pgprot_val(prot); + int write; + int ret; + + gpgd = gtoplev(vcpu, vaddr); + val = lhread_u64(vcpu, gpgd); + if (!(val & _PAGE_PRESENT)) { + printk("pgd not present pgd:%llx vaddr:%llx val:%llx\n", gpgd, vaddr, val); + return 0; + } + + gpud = val & PTE_MASK; + + pudir = hash_pud(vcpu, gpud, pgd_index(vaddr)); + if (!pudir) + return 0; /* -ENOMEM */ + + if (vaddr >= linfo->page_offset) + pudir->flags |= LGUEST_PUD_KERNEL_FL; + + gpud += pud_index(vaddr) * sizeof(u64); + val = lhread_u64(vcpu, gpud); + if (!(val & _PAGE_PRESENT)) { + printk("pud not present?\n"); + return 0; + } + + gpmd = val & PTE_MASK; + + pmdir = hash_pmd(vcpu, pudir, gpmd, pud_index(vaddr)); + if (!pmdir) + return 0; /* -ENOMEM */ + + if (vaddr >= linfo->page_offset) + pmdir->flags |= LGUEST_PMD_KERNEL_FL; + + gpmd += pmd_index(vaddr) * sizeof(u64); + val = lhread_u64(vcpu, gpmd); + if (!(val & _PAGE_PRESENT)) { + printk("pmd not present?\n"); + return 0; + } + + /* The guest might have set up a 2M page */ + if (val & (1<<7)) { + /* 2M pages */ + /* + * Although the guest may have mapped this into 2M pages + * we haven't and wont. So we still need to find the 4K + * page position. + */ + paddr = val & ~((1<<20)-1); + paddr += pte_index(vaddr) << PAGE_SHIFT; + paddr &= PTE_MASK; /* can still have the NX bit set */ + } else { + /* 4K pages */ + gpte = val & PTE_MASK; + + ptedir = hash_pte(vcpu, pmdir, gpte, pmd_index(vaddr)); + if (!ptedir) + return 0; /* -ENOMEM */ + + gpte += pte_index(vaddr) * sizeof(u64); + val = lhread_u64(vcpu, gpte); + if (!(val & _PAGE_PRESENT) || ((flags & _PAGE_DIRTY) && !(val & _PAGE_RW))) { + printk("pte not present or dirty?\n"); + return 0; + } + /* this is the guest's paddr */ + paddr = val & PTE_MASK; + + } + + /* FIXME: check these values */ + + /* + * FIXME: if this isn't write, we lose the lguest_data when we do + * a put_user in the hypercall init. + */ + write = 1; // val & _PAGE_DIRTY ? 1 : 0; + + val = get_pfn(paddr >> PAGE_SHIFT, write); + if (val == (unsigned long)-1UL) { + printk("bad 1\n"); + kill_guest_dump(vcpu, "page %llx not mapped", paddr); + return 0; + } + + /* now we have the actual paddr */ + val <<= PAGE_SHIFT; + + ret = __lguest_map_guest_page(vcpu->guest, vcpu->pgdir->pgdir, + vaddr, val, __pgprot(flags)); + if (ret < 0) { + printk("bad 2\n"); + kill_guest_dump(vcpu, "can't map page"); + return 0; + } + return 1; +} + +int demand_page(struct lguest_vcpu *vcpu, u64 vaddr, int write) +{ + return page_in(vcpu, vaddr, (write ? 
PAGE_SHARED_EXEC : PAGE_COPY_EXEC)); +} + + +static pud_t *pud_from_index(unsigned long addr, unsigned index) +{ + pud_t *pud = (pud_t*)addr; + + return &pud[index]; +} + +static pmd_t *pmd_from_index(unsigned long addr, unsigned index) +{ + pmd_t *pmd = (pmd_t*)addr; + + return &pmd[index]; +} + +static pte_t *pte_from_index(unsigned long addr, unsigned index) +{ + pte_t *pte = (pte_t*)addr; + + return &pte[index]; +} + +static int __lguest_map_guest_pte(pmd_t *pmd, unsigned long vaddr, + unsigned long paddr, pgprot_t prot) +{ + unsigned long page; + pte_t *pte; + unsigned index; + + page = pmd_page_vaddr(*pmd); + + index = pte_index(vaddr); + pte = pte_from_index(page, index); + if (pte_val(*pte) & _PAGE_PRESENT && + pte_val(*pte) == pte_val(pfn_pte(paddr>>PAGE_SHIFT, prot)) ) { + printk("stange page faulting!\n"); + printk("paddr=%lx (paddr)=%lx\n", paddr, *(unsigned long *)__va(paddr)); + printk("vaddr: %lx pte %x val: %lx\n", vaddr, index, pte_val(*pte)); + } + + set_pte(pte, mk_pte(pfn_to_page(paddr >> PAGE_SHIFT), prot)); + + return 0; +} + +static int __lguest_map_guest_pmd(pud_t *pud, unsigned long vaddr, unsigned long paddr, + pgprot_t prot) +{ + unsigned long page; + pmd_t *pmd; + unsigned index; + + page = pud_page_vaddr(*pud); + + index = pmd_index(vaddr); + pmd = pmd_from_index(page, index); + if (!pmd_val(*pmd)) { + page = get_zeroed_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(page))); + } + + return __lguest_map_guest_pte(pmd, vaddr, paddr, prot); +} + +static int __lguest_map_guest_pud(pgd_t *pgd, unsigned long vaddr, unsigned long paddr, + pgprot_t prot) +{ + unsigned long page; + pud_t *pud; + unsigned index; + + page = pgd_page_vaddr(*pgd); + + index = pud_index(vaddr); + pud = pud_from_index(page, index); + if (!pud_val(*pud)) { + page = get_zeroed_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + set_pud(pud, __pud(_PAGE_TABLE | __pa(page))); + } + + return __lguest_map_guest_pmd(pud, vaddr, paddr, prot); +} + +static int __lguest_map_guest_pgd(u64 *cr3, + unsigned long vaddr, unsigned long paddr, + pgprot_t prot) +{ + unsigned long page; + unsigned index; + pgd_t *pgd; + + index = pgd_index(vaddr); + pgd = (pgd_t*)&cr3[index]; + if (!pgd_val(*pgd)) { + page = get_zeroed_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(page))); + } + + return __lguest_map_guest_pud(pgd, vaddr, paddr, prot); +} + +static int __lguest_map_guest_page(struct lguest_guest_info *linfo, u64 *cr3, + unsigned long vaddr, unsigned long paddr, + pgprot_t prot) +{ + int ret; + + ret = __lguest_map_guest_pgd(cr3, vaddr, paddr, prot); + if (ret < 0) + return ret; + + /* All guest kernel pages are the same */ + if (vaddr >= linfo->page_offset) { + struct lguest_pgd *pgdir; + unsigned index; + pgd_t *pgd; + u64 val; + + index = pgd_index(vaddr); + pgd = (pgd_t*)&cr3[index]; + val = pgd_val(*pgd); + + list_for_each_entry(pgdir, &linfo->pgd_list, list) + pgdir->pgdir[index] = val; + } + return ret; +} + +static void __lguest_unmap_page_pmd(pmd_t *pmd, unsigned long vaddr) +{ + pte_t *pte; + unsigned index; + unsigned long page; + + page = pmd_page_vaddr(*pmd); + + index = pte_index(vaddr); + pte = pte_from_index(page, index); + if (pte_val(*pte) & 1) + set_pte(pte, __pte(0)); +} + +static void __lguest_unmap_page_pud(pud_t *pud, unsigned long vaddr) +{ + pmd_t *pmd; + unsigned index; + unsigned long page; + + page = pud_page_vaddr(*pud); + + index = pmd_index(vaddr); + pmd = pmd_from_index(page, index); + if 
(pmd_val(*pmd) & 1) + __lguest_unmap_page_pmd(pmd, vaddr); +} + +static void __lguest_unmap_page_pgd(pgd_t *pgd, unsigned long vaddr) +{ + pud_t *pud; + unsigned index; + unsigned long page; + + page = pgd_page_vaddr(*pgd); + + index = pud_index(vaddr); + pud = pud_from_index(page, index); + if (pud_val(*pud) & 1) + __lguest_unmap_page_pud(pud, vaddr); +} + +static void __lguest_unmap_guest_page(struct lguest_guest_info *linfo, + unsigned long vaddr) +{ + pgd_t *pgd; + unsigned index; + u64 *cr3 = linfo->kpgdir.pgdir; + + if (!cr3) + return; + + index = pgd_index(vaddr); + pgd = (pgd_t*)&cr3[index]; + if (!(pgd_val(*pgd)&1)) + return; + + __lguest_unmap_page_pgd(pgd, vaddr); +} + +int lguest_map_hv_pages(struct lguest_guest_info *lguest, + unsigned long vaddr, int pages, + pgprot_t *pprot) +{ + unsigned long page; + int i; + int ret; + pgprot_t prot; + + ret = -ENOMEM; + for (i=0; i < pages; i++) { + /* now add the page we want */ + page = hvvm_get_actual_phys((void*)vaddr+PAGE_SIZE*i, &prot); + if (!page) + goto failed; + + if (pprot) + prot = *pprot; + ret = __lguest_map_guest_page(lguest, lguest->kpgdir.pgdir, + vaddr+PAGE_SIZE*i, page, prot); + if (ret < 0) + goto failed; + } + return 0; +failed: + for (--i; i >= 0; i--) + __lguest_unmap_guest_page(lguest, vaddr+PAGE_SIZE*i); + return ret; +} + +/** + * lguest_mem_addr - retrieve page that's mapped from guest. + * @vcpu: lguest vcpu descriptor. + * @addr: address to get from the guest's address space. + * + * ONLY USE WHEN ALL ELSE FAILS! + */ +void *lguest_mem_addr(struct lguest_vcpu *vcpu, u64 addr) +{ + struct lguest_guest_info *linfo = vcpu->guest; + u64 *cr3 = linfo->kpgdir.pgdir; + unsigned long page; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned index = pgd_index(addr); + + pgd = (pgd_t*)&cr3[index]; + if (!(pgd_val(*pgd) & 1)) + return NULL; + + page = pgd_page_vaddr(*pgd); + index = pud_index(addr); + pud = pud_from_index(page, index); + if (!(pud_val(*pud) & 1)) + return NULL; + + page = pud_page_vaddr(*pud); + index = pmd_index(addr); + pmd = pmd_from_index(page, index); + if (!(pmd_val(*pmd) & 1)) + return NULL; + + page = pmd_page_vaddr(*pmd); + index = pte_index(addr); + pte = pte_from_index(page, index); + if (!(pte_val(*pte) & 1)) + return NULL; + + page = ((pte_val(*pte) & PAGE_MASK) + (addr & (PAGE_SIZE-1))); + + return (void *)(page + PAGE_OFFSET); +} + +void __lguest_free_guest_pmd(pmd_t *pmd) +{ + pte_t *pte; + unsigned long page; + int i; + + page = pmd_page_vaddr(*pmd); + + for (i=0; i < PTRS_PER_PTE; i++) { + pte = pte_from_index(page, i); + if (!(pte_val(*pte) & 1)) + continue; + /* FIXME: do some checks here??? 
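Present entries still hold the page reference taken in get_pfn(), so presumably release_pte() should be called here before the table is freed. 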
*/ + } + set_pmd(pmd, __pmd(0)); + free_page(page); +} + +void __lguest_free_guest_pud(pud_t *pud) +{ + pmd_t *pmd; + unsigned long page; + int i; + + page = pud_page_vaddr(*pud); + + for (i=0; i < PTRS_PER_PMD; i++) { + pmd = pmd_from_index(page, i); + if (!(pmd_val(*pmd) & 1)) + continue; + __lguest_free_guest_pmd(pmd); + } + set_pud(pud, __pud(0)); + free_page(page); +} + +void __lguest_free_guest_pgd(pgd_t *pgd) +{ + pud_t *pud; + unsigned long page; + int i; + + page = pgd_page_vaddr(*pgd); + + for (i=0; i < PTRS_PER_PUD; i++) { + pud = pud_from_index(page, i); + if (!(pud_val(*pud) & 1)) + continue; + __lguest_free_guest_pud(pud); + } + set_pgd(pgd, __pgd(0)); + free_page(page); +} + +void __lguest_free_guest_pages(u64 *cr3) +{ + pgd_t *pgd; + int i; + + if (!cr3) + return; + + for (i=0; i < PTRS_PER_PGD; i++) { + pgd = (pgd_t*)&cr3[i]; + if (!(pgd_val(*pgd) & 1)) + continue; + __lguest_free_guest_pgd(pgd); + } + free_page((u64)cr3); +} + +void __lguest_free_guest_upages(struct lguest_guest_info *linfo, u64 *cr3) +{ + pgd_t *pgd; + int i; + + if (!cr3) + return; + + for (i=0; i < pgd_index(linfo->page_offset); i++) { + pgd = (pgd_t*)&cr3[i]; + if (!(pgd_val(*pgd) & 1)) + continue; + __lguest_free_guest_pgd(pgd); + } + free_page((u64)cr3); +} + +void lguest_free_guest_pages(struct lguest_guest_info *linfo) +{ + int i; + + /* This frees all the guest kernel pages */ + __lguest_free_guest_pages(linfo->kpgdir.pgdir); + + for (i=0; i < LGUEST_PGDIRS; i++) + __lguest_free_guest_upages(linfo, linfo->pgdirs[i].pgdir); +} + +void lguest_unmap_guest_pages(struct lguest_guest_info *lguest, + unsigned long vaddr, int pages) +{ + int i; + + for (i=0; i < pages; i++) + __lguest_unmap_guest_page(lguest, vaddr+PAGE_SIZE*i); +} + +int lguest_init_vcpu_pagetable(struct lguest_vcpu *vcpu) +{ + struct lguest_guest_info *linfo = vcpu->guest; + + mutex_lock(&linfo->page_lock); + vcpu->pgdir = new_pgdir(linfo, linfo->kpgdir.cr3); + BUG_ON(!vcpu->pgdir); + if (!vcpu->pgdir->count++) + vcpu->pgdir->flags |= LGUEST_PGD_BUSY_FL; + list_add(&vcpu->pgdir->list, &linfo->pgd_list); + mutex_unlock(&linfo->page_lock); + + return 0; +} + +int init_guest_pagetable(struct lguest_guest_info *linfo, u64 pgtable) +{ + int ret = -ENOMEM; + + linfo->kpgdir.cr3 = pgtable; + linfo->kpgdir.pgdir = (u64*)get_zeroed_page(GFP_KERNEL); + if (!linfo->kpgdir.pgdir) + return -ENOMEM; + linfo->kpgdir.flags |= LGUEST_PGD_BUSY_FL | LGUEST_PGD_MASTER_FL; + linfo->kpgdir.count = -1; + + /* + * The list is used to update all the kernel page tables, + * so that they all have the same mappings. 
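See __lguest_map_guest_page(), which propagates each new kernel PGD entry to every pgdir on this list. 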
+ */ + list_add(&linfo->kpgdir.list, &linfo->pgd_list); + + ret = lguest_map_hv_pages(linfo, lguest_hv_addr, + lguest_hv_pages, NULL); + if (ret < 0) + goto out; + + return 0; + out: + free_page((u64)linfo->kpgdir.pgdir); + + return ret; +} + Index: work-pv/arch/x86_64/Makefile =================================================================== --- work-pv.orig/arch/x86_64/Makefile +++ work-pv/arch/x86_64/Makefile @@ -84,6 +84,7 @@ core-y += arch/x86_64/kernel/ \ core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/ drivers-$(CONFIG_PCI) += arch/x86_64/pci/ drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/ +drivers-$(CONFIG_LGUEST_GUEST) += arch/x86_64/lguest/ boot := arch/x86_64/boot Index: work-pv/include/asm-x86_64/lguest.h =================================================================== --- /dev/null +++ work-pv/include/asm-x86_64/lguest.h @@ -0,0 +1,350 @@ +#ifndef _LGUEST_H_ +#define _LGUEST_H_ +#include <asm/desc.h> +#include <asm/hw_irq.h> +#include <linux/futex.h> +#include <asm/lguest_user.h> + +/* XXX: Come up with better magic later on */ +#define LGUEST_MAGIC_R13 0x1 +#define LGUEST_MAGIC_R14 0x2 +#define LGUEST_MAGIC_R15 0x3 + +#define LGUEST_MAX_VCPUS 64 + +#define LGUEST_PGDS_PER_VCPU 8 +#define LGUEST_PGDIRS (LGUEST_MAX_VCPUS * LGUEST_PGDS_PER_VCPU) + +#define LGUEST_IRQS 32 + +#define LHCALL_FLUSH_ASYNC 0 +#define LHCALL_LGUEST_INIT 1 +#define LHCALL_CRASH 2 +#define LHCALL_LOAD_GDT 3 +#define LHCALL_NEW_PGTABLE 4 +#define LHCALL_FLUSH_TLB 5 +#define LHCALL_LOAD_IDT_ENTRY 6 +#define LHCALL_SET_STACK 7 +#define LHCALL_TS 8 +#define LHCALL_TIMER_READ 9 +#define LHCALL_TIMER_START 10 +#define LHCALL_HALT 11 +#define LHCALL_GET_WALLCLOCK 12 +#define LHCALL_BIND_DMA 13 +#define LHCALL_SEND_DMA 14 +#define LHCALL_FLUSH_TLB_SIG 15 +#define LHCALL_SET_PTE 16 +#define LHCALL_SET_PMD 17 +#define LHCALL_SET_PUD 18 +#define LHCALL_SET_PGD 19 +#define LHCALL_CLEAR_PTE 20 +#define LHCALL_CLEAR_PMD 21 +#define LHCALL_CLEAR_PUD 22 +#define LHCALL_CLEAR_PGD 23 +#define LHCALL_LOAD_TLS 24 +#define LHCALL_RDMSR 25 +#define LHCALL_WRMSR 26 +#define LHCALL_IRET 27 + +#define LHCALL_PRINT 60 +#define LHCALL_DEBUG_ME 99 + +#define LGUEST_TRAP_ENTRY 0x1F + +static inline unsigned long +hcall(unsigned long call, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) + : "=a"(call) + : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) + : "memory"); + return call; +} + +void async_hcall(unsigned long call, + unsigned long arg1, unsigned long arg2, unsigned long arg3); + +struct lguest_vcpu; + +struct lguest_dma_info +{ + struct list_head list; + union futex_key key; + unsigned long dmas; + u16 next_dma; + u16 num_dmas; + u32 guest_id; + u8 interrupt; /* 0 when not registered */ +}; + + +/* these must be powers of two */ +#define PUD_HASH_SIZE 256 +#define PMD_HASH_SIZE 256 +#define PTE_HASH_SIZE 256 + +#define LGUEST_PGD_BUSY_FL (1<<0) +#define LGUEST_PGD_MASTER_FL (1<<1) +#define LGUEST_PGD_LINK_FL (1<<2) + +#define LGUEST_PUD_KERNEL_FL (1<<1) +#define LGUEST_PMD_KERNEL_FL (1<<1) +#define LGUEST_PTE_KERNEL_FL (1<<1) + +struct lguest_pgd { + struct list_head list; + u64 cr3; + u64 *pgdir; + u64 *user_pgdir; + unsigned count; + unsigned flags; +}; + +struct lguest_pud { + struct list_head list; + struct lguest_pgd *pgdir; + u64 gpud; /* guest pud */ + unsigned flags; + unsigned idx; +}; + +struct lguest_pmd { + struct list_head list; + struct lguest_pud *pudir; + u64 gpmd; /* guest pmd */ + unsigned flags; + unsigned idx; +}; + +struct 
lguest_pte { + struct list_head list; + struct lguest_pmd *pmdir; + u64 gpte; /* guest pte */ + unsigned flags; + unsigned idx; +}; + +struct lguest_guest_info { + struct lguest_data __user *lguest_data; + struct task_struct *tsk; + struct mm_struct *mm; + u32 guest_id; + u64 pfn_limit; + u64 start_kernel_map; + u64 page_offset; + + int halted; + /* does it really belong here? */ + char *dead; +#if 0 + unsigned long noirq_start, noirq_end; +#endif + int dma_is_pending; + unsigned long pending_dma; /* struct lguest_dma */ + unsigned long pending_addr; /* address they're sending to */ + + struct lguest_pgd kpgdir; + struct lguest_pgd pgdirs[LGUEST_PGDIRS]; + struct list_head pgd_list; + struct list_head pud_hash[PUD_HASH_SIZE]; + struct list_head pmd_hash[PMD_HASH_SIZE]; + struct list_head pte_hash[PTE_HASH_SIZE]; + struct mutex page_lock; + + int timer_on; + int last_timer; + + /* Cached wakeup: we hold a reference to this task. */ + struct task_struct *wake; + + struct lguest_dma_info dma[LGUEST_MAX_DMA]; + + struct lguest_vcpu *vcpu[LGUEST_MAX_VCPUS]; +}; + +/* copied from old lguest code. Not sure if it's the best layout for us */ +struct lguest_regs +{ + u64 cr3; /* 0 ( 0x0) */ + /* Manually saved part. */ + u64 rbx, rcx, rdx; /* 8 ( 0x8) */ + u64 rsi, rdi, rbp; /* 32 (0x20) */ + u64 r8, r9, r10, r11; /* 56 (0x38) */ + u64 r12, r13, r14, r15; /* 88 (0x58) */ + u64 rax; /* 120 (0x78) */ + u64 fs; /* ds; */ /* 128 (0x80) */ + u64 trapnum, errcode; /* 136 (0x88) */ + /* Trap pushed part */ + u64 rip; /* 152 (0x98) */ + u64 cs; /* 160 (0xa0) */ + u64 rflags; /* 168 (0xa8) */ + u64 rsp; /* 176 (0xb0) */ + u64 ss; /* Crappy Segment! */ /* 184 (0xb8) */ + /* size = 192 (0xc0) */ + char size[0]; +}; + +struct lguest_tss_struct { + u32 reserved1; + u64 rsp0; + u64 rsp1; + u64 rsp2; + u64 reserved2; + u64 ist[7]; + u32 reserved3; + u32 reserved4; + u16 reserved5; + u16 io_bitmap_base; + /* we don't let the guest have io privileges (yet) */ + unsigned long io_bitmap[1]; +} __attribute__((packed)) ____cacheline_aligned; + +struct lguest_vcpu { + unsigned long host_syscall; + unsigned long guest_syscall; + + /* Must be 16 bytes aligned at regs+sizeof(regs) */ + struct lguest_regs regs; + + struct lguest_vcpu *vcpu; /* pointer to itself */ + unsigned long debug; + unsigned long magic; + unsigned int id; + unsigned long host_stack; + unsigned long guest_stack; + unsigned long host_cr3; + unsigned long host_page; + struct desc_ptr host_gdt; + u16 host_gdt_buff[3]; + struct desc_ptr host_idt; + u16 host_idt_buff[3]; + unsigned long host_gdt_ptr; + /* Save rax on interrupts, it's used for iret hcall */ + unsigned long rax; + + /* Host save gs base pointer */ + unsigned long host_gs_a; + unsigned long host_gs_d; + + /* save host process gs base pointer */ + unsigned long host_proc_gs_a; + unsigned long host_proc_gs_d; + + /* save guest gs base pointer */ + unsigned long guest_gs_a; + unsigned long guest_gs_d; + + /* used for guest calling swapgs */ + unsigned long guest_gs_shadow_a; + unsigned long guest_gs_shadow_d; + + struct lguest_pgd *pgdir; + + struct desc_ptr gdt; /* address of the GDT at this vcpu */ + u16 gdt_buff[3]; + struct desc_struct gdt_table[GDT_ENTRIES]; + + struct desc_ptr idt; /* address of the IDT at this vcpu */ + u16 idt_buff[3]; + struct gate_struct idt_table[IDT_ENTRIES]; + + struct lguest_guest_info *guest; + + struct lguest_tss_struct tss; + + unsigned long ts; + + /* host ist 7 - we use it to prevent the NMI race */ + unsigned long host_ist; + + /* only for those above 
FIRST_EXTERNAL_VECTOR */ + DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); + /* those are general. We catch every possible interrupt */ + DECLARE_BITMAP(interrupt_disabled, LGUEST_IRQS + FIRST_EXTERNAL_VECTOR); + unsigned long interrupt[LGUEST_IRQS + FIRST_EXTERNAL_VECTOR]; + + /* nmi trampoline storage */ + + struct lguest_regs nmi_regs; + unsigned long nmi_gs_a; + unsigned long nmi_gs_d; + unsigned long nmi_gs_shadow_a; + unsigned long nmi_gs_shadow_d; + struct desc_ptr nmi_gdt; + u16 nmi_gdt_buff[3]; + + /* set when we take an nmi */ + unsigned long nmi_sw; + + /* is this enough? */ + char nmi_stack[1048]; + char nmi_stack_end[0]; + char gpf_stack[1048]; + char gpf_stack_end[0]; + char df_stack[1048]; + char df_stack_end[0]; +}; + + +#define LHCALL_RING_SIZE 64 +struct hcall_ring +{ + u32 eax, edx, ebx, ecx; +}; + +struct lguest_text_ptr { + unsigned long next; /* guest pa address of next pointer */ + unsigned long start; + unsigned long end; +}; + +struct lguest_data +{ +/* Fields which change during running: */ + /* 512 == enabled (same as eflags) */ + unsigned int irq_enabled; + /* Blocked interrupts. */ + DECLARE_BITMAP(interrupts, LGUEST_IRQS); + + /* Last (userspace) address we got a GPF & reloaded gs. */ + unsigned int gs_gpf_eip; + + /* Virtual address of page fault. */ + unsigned long cr2; + + /* Async hypercall ring. 0xFF == done, 0 == pending. */ + u8 hcall_status[LHCALL_RING_SIZE]; + struct hcall_ring hcalls[LHCALL_RING_SIZE]; + +/* Fields initialized by the hypervisor at boot: */ + /* Memory not to try to access */ + unsigned long reserve_mem; + /* ID of this guest (used by network driver to set ethernet address) */ + u32 guest_id; + +/* Fields initialized by the guest at boot: */ + /* Instruction range to suppress interrupts even if enabled */ +#if 0 + unsigned long noirq_start, noirq_end; +#endif + unsigned long start_kernel_map; + unsigned long page_offset; + unsigned long text; /* pa address of lguest_text_ptr addresses */ + +/* If the kernel has kallsyms, we can use it to do backtraces of a guest */ + unsigned long kallsyms_addresses; + unsigned long kallsyms_num_syms; + unsigned long kallsyms_names; + unsigned long kallsyms_token_table; + unsigned long kallsyms_token_index; + unsigned long kallsyms_markers; + + unsigned long return_address; +}; + +extern struct lguest_data lguest_data; +extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */ +int run_guest(struct lguest_vcpu *vcpu, char *__user user); + +#endif -- _______________________________________________ Virtualization mailing list Virtualization@xxxxxxxxxxxxxx https://lists.osdl.org/mailman/listinfo/virtualization