[PATCH 6/10] lguest code: the little linux hypervisor.

This is the core of lguest: both the guest code (always compiled into
the image so it can boot under lguest), and the host code (lg.ko).

There is only one config prompt at the moment: lguest is currently
designed to run exactly the same guest and host kernels so we can
frob the ABI freely.

Unfortunately, we don't have the build infrastructure for "private"
asm-offsets.h files, so there's a not-so-neat include in
arch/i386/kernel/asm-offsets.c.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

===================================================================
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -226,6 +226,27 @@ config ES7000_CLUSTERED_APIC
 	depends on SMP && X86_ES7000 && MPENTIUMIII
 
 source "arch/i386/Kconfig.cpu"
+
+config LGUEST
+	tristate "Linux hypervisor example code"
+	depends on X86 && PARAVIRT && EXPERIMENTAL && !X86_PAE
+	select LGUEST_GUEST
+	select HVC_DRIVER
+	---help---
+	  This is a very simple module which allows you to run
+	  multiple instances of the same Linux kernel, using the
+	  "lguest" command found in the Documentation/lguest directory.
+	  Note that "lguest" is pronounced to rhyme with "fell quest",
+	  not "rustyvisor".  See Documentation/lguest/lguest.txt.
+
+	  If unsure, say N.  If curious, say M.  If masochistic, say Y.
+
+config LGUEST_GUEST
+	bool
+	help
+	  The guest needs code built-in, even if the host has lguest
+	  support as a module.  The drivers are tiny, so we build them
+	  in too.
 
 config HPET_TIMER
 	bool "HPET Timer Support"
===================================================================
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -108,6 +108,7 @@ drivers-$(CONFIG_PCI)			+= arch/i386/pci
 # must be linked after kernel/
 drivers-$(CONFIG_OPROFILE)		+= arch/i386/oprofile/
 drivers-$(CONFIG_PM)			+= arch/i386/power/
+drivers-$(CONFIG_LGUEST_GUEST)		+= arch/i386/lguest/
 
 CFLAGS += $(mflags-y)
 AFLAGS += $(mflags-y)
===================================================================
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -16,6 +16,10 @@
 #include <asm/thread_info.h>
 #include <asm/elf.h>
 #include <asm/pda.h>
+#ifdef CONFIG_LGUEST_GUEST
+#include <asm/lguest.h>
+#include "../lguest/lg.h"
+#endif
 
 #define DEFINE(sym, val) \
         asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -111,4 +115,19 @@ void foo(void)
 	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
 	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
 #endif
+
+#ifdef CONFIG_LGUEST_GUEST
+	BLANK();
+	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+	OFFSET(LGUEST_STATE_host_stackptr, lguest_state, host.stackptr);
+	OFFSET(LGUEST_STATE_host_pgdir, lguest_state, host.pgdir);
+	OFFSET(LGUEST_STATE_host_gdt, lguest_state, host.gdt);
+	OFFSET(LGUEST_STATE_host_idt, lguest_state, host.idt);
+	OFFSET(LGUEST_STATE_regs, lguest_state, regs);
+	OFFSET(LGUEST_STATE_gdt, lguest_state, gdt);
+	OFFSET(LGUEST_STATE_idt, lguest_state, idt);
+	OFFSET(LGUEST_STATE_gdt_table, lguest_state, gdt_table);
+	OFFSET(LGUEST_STATE_trapnum, lguest_state, regs.trapnum);
+	OFFSET(LGUEST_STATE_errcode, lguest_state, regs.errcode);
+#endif
 }
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/Makefile
@@ -0,0 +1,22 @@
+# Guest requires the paravirt_ops replacement and the bus driver.
+obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_bus.o
+
+# Host requires the other files, which can be a module.
+obj-$(CONFIG_LGUEST)	+= lg.o
+lg-objs := core.o hypercalls.o page_tables.o interrupts_and_traps.o \
+	segments.o io.o lguest_user.o
+
+# We use the top 4MB for the guest traps page, then the hypervisor.
+HYPE_ADDR := (0xFFC00000+4096)
+# The data is only 1k (256 interrupt handler pointers)
+HYPE_DATA_SIZE := 1024
+CFLAGS += -DHYPE_ADDR="$(HYPE_ADDR)" -DHYPE_DATA_SIZE="$(HYPE_DATA_SIZE)"
+
+$(obj)/core.o: $(obj)/hypervisor-blob.c
+# This links the hypervisor in the right place and turns it into a C array.
+$(obj)/hypervisor-raw: $(obj)/hypervisor.o
+	@$(LD) -static -Tdata=`printf %#x $$(($(HYPE_ADDR)))` -Ttext=`printf %#x $$(($(HYPE_ADDR)+$(HYPE_DATA_SIZE)))` -o $@ $< && $(OBJCOPY) -O binary $@
+$(obj)/hypervisor-blob.c: $(obj)/hypervisor-raw
+	@od -tx1 -An -v $< | sed -e 's/^ /0x/' -e 's/$$/,/' -e 's/ /,0x/g' > $@
+
+clean-files := hypervisor-blob.c hypervisor-raw
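
For the curious: that od|sed pipeline emits the raw switcher binary as a
bare C initializer, one byte per element; input bytes "0f a8 0f a9" come
out as:

	0x0f,0xa8,0x0f,0xa9,

which core.c then #includes directly into its hypervisor_blob[] array.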
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/core.c
@@ -0,0 +1,425 @@
+/* World's simplest hypervisor, to test paravirt_ops and show
+ * unbelievers that virtualization is the future.  Plus, it's fun! */
+#include <linux/module.h>
+#include <linux/stringify.h>
+#include <linux/stddef.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/lguest.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/poll.h>
+#include <asm/highmem.h>
+#include <asm/asm-offsets.h>
+#include "lg.h"
+
+/* This is our hypervisor, compiled from hypervisor.S. */
+static char __initdata hypervisor_blob[] = {
+#include "hypervisor-blob.c"
+};
+
+#define MAX_LGUEST_GUESTS \
+	((HYPERVISOR_SIZE-sizeof(hypervisor_blob))/sizeof(struct lguest_state))
+
+static struct vm_struct *hypervisor_vma;
+static int cpu_had_pge;
+static struct {
+	unsigned long offset;
+	unsigned short segment;
+} lguest_entry;
+struct page *hype_pages; /* Contiguous pages. */
+struct lguest lguests[MAX_LGUEST_GUESTS];
+DECLARE_MUTEX(lguest_lock);
+
+/* IDT entries are at start of hypervisor. */
+const unsigned long *__lguest_default_idt_entries(void)
+{
+	return (void *)HYPE_ADDR;
+}
+
+/* Next is switch_to_guest */
+static void *__lguest_switch_to_guest(void)
+{
+	return (void *)HYPE_ADDR + HYPE_DATA_SIZE;
+}
+
+/* Then we use everything else to hold guest state. */
+struct lguest_state *__lguest_states(void)
+{
+	return (void *)HYPE_ADDR + sizeof(hypervisor_blob);
+}
+
+static __init int map_hypervisor(void)
+{
+	unsigned int i;
+	int err;
+	struct page *pages[HYPERVISOR_PAGES], **pagep = pages;
+
+	hype_pages = alloc_pages(GFP_KERNEL|__GFP_ZERO,
+				 get_order(HYPERVISOR_SIZE));
+	if (!hype_pages)
+		return -ENOMEM;
+
+	hypervisor_vma = __get_vm_area(HYPERVISOR_SIZE, VM_ALLOC,
+				       HYPE_ADDR, VMALLOC_END);
+	if (!hypervisor_vma) {
+		err = -ENOMEM;
+		printk("lguest: could not map hypervisor pages high\n");
+		goto free_pages;
+	}
+
+	for (i = 0; i < HYPERVISOR_PAGES; i++)
+		pages[i] = hype_pages + i;
+
+	err = map_vm_area(hypervisor_vma, PAGE_KERNEL, &pagep);
+	if (err) {
+		printk("lguest: map_vm_area failed: %i\n", err);
+		goto free_vma;
+	}
+	memcpy(hypervisor_vma->addr, hypervisor_blob, sizeof(hypervisor_blob));
+
+	/* Setup LGUEST segments on all cpus */
+	for_each_possible_cpu(i) {
+		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+	}
+
+	/* Initialize entry point into hypervisor. */
+	lguest_entry.offset = (long)__lguest_switch_to_guest();
+	lguest_entry.segment = LGUEST_CS;
+
+	printk("lguest: mapped hypervisor at %p\n", hypervisor_vma->addr);
+	return 0;
+
+free_vma:
+	vunmap(hypervisor_vma->addr);
+free_pages:
+	__free_pages(hype_pages, get_order(HYPERVISOR_SIZE));
+	return err;
+}
+
+static __exit void unmap_hypervisor(void)
+{
+	vunmap(hypervisor_vma->addr);
+	__free_pages(hype_pages, get_order(HYPERVISOR_SIZE));
+}
+
+/* IN/OUT insns: enough to get us past boot-time probing. */
+static int emulate_insn(struct lguest *lg)
+{
+	u8 insn;
+	unsigned int insnlen = 0, in = 0, shift = 0;
+	unsigned long physaddr = guest_pa(lg, lg->state->regs.eip);
+
+	/* This only works for addresses in linear mapping... */
+	if (lg->state->regs.eip < lg->page_offset)
+		return 0;
+	lhread(lg, &insn, physaddr, 1);
+
+	/* Operand size prefix means it's actually for ax. */
+	if (insn == 0x66) {
+		shift = 16;
+		insnlen = 1;
+		lhread(lg, &insn, physaddr + insnlen, 1);
+	}
+
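+	/* The bottom opcode bit selects operand size (0 = %al, 1 =
+	 * %ax/%eax), so mask it off to halve the cases below. */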
+	switch (insn & 0xFE) {
+	case 0xE4: /* in     <next byte>,%al */
+		insnlen += 2;
+		in = 1;
+		break;
+	case 0xEC: /* in     (%dx),%al */
+		insnlen += 1;
+		in = 1;
+		break;
+	case 0xE6: /* out    %al,<next byte> */
+		insnlen += 2;
+		break;
+	case 0xEE: /* out    %al,(%dx) */
+		insnlen += 1;
+		break;
+	default:
+		return 0;
+	}
+
+	if (in) {
+		/* Lower bit tells us whether it's a 16 or 32 bit access */
+		if (insn & 0x1)
+			lg->state->regs.eax = 0xFFFFFFFF;
+		else
+			lg->state->regs.eax |= (0xFFFF << shift);
+	}
+	lg->state->regs.eip += insnlen;
+	return 1;
+}
+
+int find_free_guest(void)
+{
+	unsigned int i;
+	for (i = 0; i < MAX_LGUEST_GUESTS; i++)
+		if (!lguests[i].state)
+			return i;
+	return -1;
+}
+
+int lguest_address_ok(const struct lguest *lg, unsigned long addr)
+{
+	return addr / PAGE_SIZE < lg->pfn_limit;
+}
+
+/* Just like get_user, but don't let guest access lguest binary. */
+u32 lhread_u32(struct lguest *lg, u32 addr)
+{
+	u32 val = 0;
+
+	/* Don't let them access the lguest binary. */
+	if (!lguest_address_ok(lg, addr)
+	    || get_user(val, (u32 __user *)addr) != 0)
+		kill_guest(lg, "bad read address %u", addr);
+	return val;
+}
+
+void lhwrite_u32(struct lguest *lg, u32 addr, u32 val)
+{
+	if (!lguest_address_ok(lg, addr)
+	    || put_user(val, (u32 __user *)addr) != 0)
+		kill_guest(lg, "bad write address %u", addr);
+}
+
+void lhread(struct lguest *lg, void *b, u32 addr, unsigned bytes)
+{
+	if (addr + bytes < addr || !lguest_address_ok(lg, addr+bytes)
+	    || copy_from_user(b, (void __user *)addr, bytes) != 0) {
+		/* copy_from_user should do this, but as we rely on it... */
+		memset(b, 0, bytes);
+		kill_guest(lg, "bad read address %u len %u", addr, bytes);
+	}
+}
+
+void lhwrite(struct lguest *lg, u32 addr, const void *b, unsigned bytes)
+{
+	if (addr + bytes < addr
+	    || !lguest_address_ok(lg, addr+bytes)
+	    || copy_to_user((void __user *)addr, b, bytes) != 0)
+		kill_guest(lg, "bad write address %u len %u", addr, bytes);
+}
+
+/* Saves exporting idt_table from kernel */
+static struct desc_struct *get_idt_table(void)
+{
+	struct Xgt_desc_struct idt;
+
+	asm("sidt %0":"=m" (idt));
+	return (void *)idt.address;
+}
+
+extern asmlinkage void math_state_restore(void);
+
+static int usermode(struct lguest_regs *regs)
+{
+	return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
+}
+
+/* Trap page resets this when it reloads gs. */
+static int new_gfp_eip(struct lguest *lg, struct lguest_regs *regs)
+{
+	u32 eip;
+	get_user(eip, &lg->lguest_data->gs_gpf_eip);
+	if (eip == regs->eip)
+		return 0;
+	put_user(regs->eip, &lg->lguest_data->gs_gpf_eip);
+	return 1;
+}
+
+static void set_ts(unsigned int guest_ts)
+{
+	u32 cr0;
+	if (guest_ts) {
+		asm("movl %%cr0,%0":"=r" (cr0));
+		if (!(cr0 & 8))
+			asm("movl %0,%%cr0": :"r" (cr0|8));
+	}
+}
+
+static void run_guest_once(struct lguest *lg)
+{
+	unsigned int clobber;
+
+	/* Put eflags on stack, lcall does rest. */
+	asm volatile("pushf; lcall *lguest_entry"
+		     : "=a"(clobber), "=d"(clobber)
+		     : "0"(lg->state), "1"(get_idt_table())
+		     : "memory");
+}
+
+int run_guest(struct lguest *lg, char __user *user)
+{
+	struct lguest_regs *regs = &lg->state->regs;
+
+	while (!lg->dead) {
+		unsigned int cr2 = 0; /* Damn gcc */
+
+		/* Hypercalls first: we might have been out to userspace */
+		if (do_async_hcalls(lg))
+			goto pending_dma;
+
+		if (regs->trapnum == LGUEST_TRAP_ENTRY) {
+			/* Only do hypercall once. */
+			regs->trapnum = 255;
+			if (hypercall(lg, regs))
+				goto pending_dma;
+		}
+
+		if (signal_pending(current))
+			return -EINTR;
+		maybe_do_interrupt(lg);
+
+		if (lg->dead)
+			break;
+
+		if (lg->halted) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(1);
+			continue;
+		}
+
+		/* Restore limits on TLS segments if in user mode. */
+		if (usermode(regs)) {
+			unsigned int i;
+			for (i = 0; i < ARRAY_SIZE(lg->tls_limits); i++)
+				lg->state->gdt_table[GDT_ENTRY_TLS_MIN+i].a
+					|= lg->tls_limits[i];
+		}
+
+		local_irq_disable();
+		map_trap_page(lg);
+
+		/* Host state to be restored after the guest returns. */
+		asm("sidt %0":"=m"(lg->state->host.idt));
+		lg->state->host.gdt = __get_cpu_var(cpu_gdt_descr);
+
+		/* Even if *we* don't want FPU trap, guest might... */
+		set_ts(lg->ts);
+
+		run_guest_once(lg);
+
+		/* Save cr2 now if we page-faulted. */
+		if (regs->trapnum == 14)
+			asm("movl %%cr2,%0" :"=r" (cr2));
+		else if (regs->trapnum == 7)
+			math_state_restore();
+		local_irq_enable();
+
+		switch (regs->trapnum) {
+		case 13: /* We've intercepted a GPF. */
+			if (regs->errcode == 0) {
+				if (emulate_insn(lg))
+					continue;
+
+				/* FIXME: If it's reloading %gs in a loop? */
+				if (usermode(regs) && new_gfp_eip(lg, regs))
+					continue;
+			}
+
+			if (reflect_trap(lg, &lg->gpf_trap, 1))
+				continue;
+			break;
+		case 14: /* We've intercepted a page fault. */
+			if (demand_page(lg, cr2, regs->errcode & 2))
+				continue;
+
+			/* If lguest_data is NULL, this won't hurt. */
+			put_user(cr2, &lg->lguest_data->cr2);
+			if (reflect_trap(lg, &lg->page_trap, 1))
+				continue;
+			kill_guest(lg, "unhandled page fault at %#x"
+				   " (eip=%#x, errcode=%#x)",
+				   cr2, regs->eip, regs->errcode);
+			break;
+		case 7: /* We've intercepted a Device Not Available fault. */
+			/* If they don't want to know, just absorb it. */
+			if (!lg->ts) 
+				continue;
+			if (reflect_trap(lg, &lg->fpu_trap, 0))
+				continue;
+			kill_guest(lg, "unhandled FPU fault at %#x",
+				   regs->eip);
+			break;
+		case 32 ... 255: /* Real interrupt, fall thru */
+			cond_resched();
+		case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
+			continue;
+		case 6: /* Invalid opcode before they installed handler */
+			check_bug_kill(lg);
+		}
+		kill_guest(lg, "unhandled trap %i at %#x (err=%i)",
+			   regs->trapnum, regs->eip, regs->errcode);
+	}
+	return -ENOENT;
+
+pending_dma:
+	put_user(lg->pending_dma, (unsigned long *)user);
+	put_user(lg->pending_addr, (unsigned long *)user+1);
+	return sizeof(unsigned long)*2;
+}
+
+#define STRUCT_LGUEST_ELEM_SIZE(elem) sizeof(((struct lguest_state *)0)->elem)
+
+static void adjust_pge(void *on)
+{
+	if (on)
+		write_cr4(read_cr4() | X86_CR4_PGE);
+	else
+		write_cr4(read_cr4() & ~X86_CR4_PGE);
+}
+ 
+static int __init init(void)
+{
+	int err;
+
+	if (paravirt_enabled())
+		return -EPERM;
+
+	err = map_hypervisor();
+	if (err)
+		return err;
+
+	err = init_pagetables(hype_pages);
+	if (err) {
+		unmap_hypervisor();
+		return err;
+	}
+	lguest_io_init();
+
+	err = lguest_device_init();
+	if (err) {
+		free_pagetables();
+		unmap_hypervisor();
+		return err;
+	}
+	if (cpu_has_pge) { /* We have a broader idea of "global". */
+		cpu_had_pge = 1;
+		on_each_cpu(adjust_pge, 0, 0, 1);
+		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+	}
+	return 0;
+}
+
+static void __exit fini(void)
+{
+	lguest_device_remove();
+	free_pagetables();
+	unmap_hypervisor();
+	if (cpu_had_pge) {
+		set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+		on_each_cpu(adjust_pge, (void *)1, 0, 1);
+	}
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
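
A hedged launcher-side sketch (lguest_user.c appears later in this patch,
and the launcher itself in a later one): run_guest() above returns two
longs through the read() buffer when guest DMA needs servicing, so the
userspace loop presumably looks something like:

	unsigned long dma[2];
	/* fd is the hypothetical /dev/lguest file descriptor. */
	while (read(fd, dma, sizeof(dma)) == sizeof(dma))
		service_dma(dma[0] /* guest's struct lguest_dma */,
			    dma[1] /* key address it was sent to */);

where service_dma() is a stand-in, and read() failing with -EINTR or
-ENOENT means a signal arrived or the guest died, respectively.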
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/hypercalls.c
@@ -0,0 +1,199 @@
+/*  Actual hypercalls, which allow guests to actually do something.
+    Copyright (C) 2006 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+*/
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/clocksource.h>
+#include <asm/lguest.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <irq_vectors.h>
+#include "lg.h"
+
+static void guest_set_stack(struct lguest *lg,
+			    u32 seg, u32 esp, unsigned int pages)
+{
+	/* You cannot have a stack segment with priv level 0. */
+	if ((seg & 0x3) != GUEST_DPL)
+		kill_guest(lg, "bad stack segment %i", seg);
+	if (pages > 2)
+		kill_guest(lg, "bad stack pages %u", pages);
+	lg->state->tss.ss1 = seg;
+	lg->state->tss.esp1 = esp;
+	lg->stack_pages = pages;
+	pin_stack_pages(lg);
+}
+
+/* Return true if DMA to host userspace now pending. */
+static int do_hcall(struct lguest *lg, struct lguest_regs *regs)
+{
+	switch (regs->eax) {
+	case LHCALL_FLUSH_ASYNC:
+		break;
+	case LHCALL_LGUEST_INIT:
+		kill_guest(lg, "already have lguest_data");
+		break;
+	case LHCALL_CRASH: {
+		char msg[128];
+		lhread(lg, msg, regs->edx, sizeof(msg));
+		msg[sizeof(msg)-1] = '\0';
+		kill_guest(lg, "CRASH: %s", msg);
+		break;
+	}
+	case LHCALL_LOAD_GDT:
+		load_guest_gdt(lg, regs->edx, regs->ebx);
+		break;
+	case LHCALL_NEW_PGTABLE:
+		guest_new_pagetable(lg, regs->edx);
+		break;
+	case LHCALL_FLUSH_TLB:
+		if (regs->edx)
+			guest_pagetable_clear_all(lg);
+		else
+			guest_pagetable_flush_user(lg);
+		break;
+	case LHCALL_LOAD_IDT_ENTRY:
+		load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx);
+		break;
+	case LHCALL_SET_STACK:
+		guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx);
+		break;
+	case LHCALL_TS:
+		lg->ts = regs->edx;
+		break;
+	case LHCALL_TIMER_READ: {
+		u32 now = jiffies;
+		mb();
+		regs->eax = now - lg->last_timer;
+		lg->last_timer = now;
+		break;
+	}
+	case LHCALL_TIMER_START:
+		lg->timer_on = 1;
+		if (regs->edx != HZ)
+			kill_guest(lg, "Bad clock speed %i", regs->edx);
+		lg->last_timer = jiffies;
+		break;
+	case LHCALL_HALT:
+		lg->halted = 1;
+		break;
+	case LHCALL_GET_WALLCLOCK: {
+		struct timeval tv;
+		do_gettimeofday(&tv);
+		regs->eax = tv.tv_sec;
+		break;
+	}
+	case LHCALL_BIND_DMA:
+		regs->eax = bind_dma(lg, regs->edx, regs->ebx,
+				     regs->ecx >> 8, regs->ecx & 0xFF);
+		break;
+	case LHCALL_SEND_DMA:
+		return send_dma(lg, regs->edx, regs->ebx);
+	case LHCALL_SET_PTE:
+		guest_set_pte(lg, regs->edx, regs->ebx, regs->ecx);
+		break;
+	case LHCALL_SET_UNKNOWN_PTE:
+		guest_pagetable_clear_all(lg);
+		break;
+	case LHCALL_SET_PUD:
+		guest_set_pud(lg, regs->edx, regs->ebx);
+		break;
+	case LHCALL_LOAD_TLS:
+		guest_load_tls(lg, (struct desc_struct __user*)regs->edx);
+		break;
+	default:
+		kill_guest(lg, "Bad hypercall %i\n", regs->eax);
+	}
+	return 0;
+}
+
+#define log(...)					\
+	do {						\
+		mm_segment_t oldfs = get_fs();		\
+		char buf[100];				\
+		sprintf(buf, "lguest:" __VA_ARGS__);	\
+		set_fs(KERNEL_DS);			\
+		sys_write(1, buf, strlen(buf));		\
+		set_fs(oldfs);				\
+	} while(0)
+
+/* We always do queued calls before the actual hypercall. */
+int do_async_hcalls(struct lguest *lg)
+{
+	unsigned int i, pending;
+	u8 st[LHCALL_RING_SIZE];
+
+	if (!lg->lguest_data)
+		return 0;
+
+	copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st));
+	for (i = 0; i < ARRAY_SIZE(st); i++) {
+		struct lguest_regs regs;
+		unsigned int n = lg->next_hcall;
+
+		if (st[n] == 0xFF)
+			break;
+
+		if (++lg->next_hcall == LHCALL_RING_SIZE)
+			lg->next_hcall = 0;
+
+		get_user(regs.eax, &lg->lguest_data->hcalls[n].eax);
+		get_user(regs.edx, &lg->lguest_data->hcalls[n].edx);
+		get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx);
+		get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx);
+		pending = do_hcall(lg, &regs);
+		put_user(0xFF, &lg->lguest_data->hcall_status[n]);
+		if (pending)
+			return 1;
+	}
+
+	set_wakeup_process(lg, NULL);
+	return 0;
+}
+
+int hypercall(struct lguest *lg, struct lguest_regs *regs)
+{
+	int pending;
+
+	if (!lg->lguest_data) {
+		if (regs->eax != LHCALL_LGUEST_INIT) {
+			kill_guest(lg, "hypercall %i before LGUEST_INIT",
+				   regs->eax);
+			return 0;
+		}
+
+		lg->lguest_data = (struct lguest_data __user *)regs->edx;
+		/* We check here so we can simply copy_to_user/from_user */
+		if (!lguest_address_ok(lg, (long)lg->lguest_data)
+		    || !lguest_address_ok(lg, (long)(lg->lguest_data+1))){
+			kill_guest(lg, "bad guest page %p", lg->lguest_data);
+			return 0;
+		}
+		get_user(lg->noirq_start, &lg->lguest_data->noirq_start);
+		get_user(lg->noirq_end, &lg->lguest_data->noirq_end);
+		/* We reserve the top pgd entry. */
+		put_user(4U*1024*1024, &lg->lguest_data->reserve_mem);
+		put_user(lg->guestid, &lg->lguest_data->guestid);
+		put_user(clocksource_khz2mult(tsc_khz, 22),
+			 &lg->lguest_data->clock_mult);
+		return 0;
+	}
+	pending = do_hcall(lg, regs);
+	set_wakeup_process(lg, NULL);
+	return pending;
+}
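
(The guest-side hcall() wrapper lives in asm/lguest.h rather than here;
assuming it follows the register convention do_hcall() decodes above,
call in %eax, args in %edx/%ebx/%ecx, result back in %eax, a minimal
sketch would be:

	static inline unsigned long hcall(unsigned long call,
					  unsigned long arg1,
					  unsigned long arg2,
					  unsigned long arg3)
	{
		/* Trap to the host: it shows up there as
		 * regs->trapnum == LGUEST_TRAP_ENTRY. */
		asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
			     : "=a"(call)
			     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
			     : "memory");
		return call;
	}
)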
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/hypervisor.S
@@ -0,0 +1,170 @@
+/* This code sits at HYPE_ADDR (0xFFC01000, per the Makefile) to do the
+   low-level guest<->host switch.  Layout is: default_idt_entries (1k),
+   then the switch_to_guest entry point. */
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include "lg.h"
+
+#define SAVE_REGS				\
+	/* Save old guest/host state */		\
+	pushl	%es;				\
+	pushl	%ds;				\
+	pushl	%fs;				\
+	pushl	%eax;				\
+	pushl	%gs;				\
+	pushl	%ebp;				\
+	pushl	%edi;				\
+	pushl	%esi;				\
+	pushl	%edx;				\
+	pushl	%ecx;				\
+	pushl	%ebx;				\
+
+.text
+ENTRY(_start) /* ld complains unless _start is defined. */
+/* %eax contains ptr to target guest state, %edx contains host idt. */
+switch_to_guest:
+	pushl	%ss
+	SAVE_REGS
+	/* Save old stack, switch to guest's stack. */
+	movl	%esp, LGUEST_STATE_host_stackptr(%eax)
+	movl	%eax, %esp
+	/* Guest registers will be at: %esp-$LGUEST_STATE_regs */
+	addl	$LGUEST_STATE_regs, %esp
+	/* Switch to guest's GDT, IDT. */
+	lgdt	LGUEST_STATE_gdt(%eax)
+	lidt	LGUEST_STATE_idt(%eax)
+	/* Save page table top. */
+	movl	%cr3, %ebx
+	movl	%ebx, LGUEST_STATE_host_pgdir(%eax)
+	/* Set host's TSS to available (clear byte 5 bit 2). */
+	movl	(LGUEST_STATE_host_gdt+2)(%eax), %ebx
+	andb	$0xFD, (GDT_ENTRY_TSS*8 + 5)(%ebx)
+	/* Switch to guest page tables */
+	popl	%ebx
+	movl	%ebx, %cr3
+	/* Switch to guest's TSS. */
+	movl	$(GDT_ENTRY_TSS*8), %ebx
+	ltr	%bx
+	/* Restore guest regs */
+	popl	%ebx
+	popl	%ecx
+	popl	%edx
+	popl	%esi
+	popl	%edi
+	popl	%ebp
+	popl	%gs
+	/* Now we've loaded gs, neuter the TLS entries down to 1 byte/page */
+	addl	$(LGUEST_STATE_gdt_table+GDT_ENTRY_TLS_MIN*8), %eax
+	movw	$0,(%eax)
+	movw	$0,8(%eax)
+	movw	$0,16(%eax)
+	popl	%eax
+	popl	%fs
+	popl	%ds
+	popl	%es
+	/* Skip error code and trap number */
+	addl	$8, %esp
+	iret
+
+#define SWITCH_TO_HOST							\
+	SAVE_REGS;							\
+	/* Save old pgdir */						\
+	movl	%cr3, %eax;						\
+	pushl	%eax;							\
+	/* Load lguest ds segment for convenience. */			\
+	movl	$(LGUEST_DS), %eax;					\
+	movl	%eax, %ds;						\
+	/* Now figure out who we are */					\
+	movl	%esp, %eax;						\
+	subl	$LGUEST_STATE_regs, %eax;				\
+	/* Switch to host page tables (GDT, IDT and stack are in host   \
+	   mem, so need this first) */					\
+	movl	LGUEST_STATE_host_pgdir(%eax), %ebx;			\
+	movl	%ebx, %cr3;						\
+	/* Set guest's TSS to available (clear byte 5 bit 2). */	\
+	andb	$0xFD, (LGUEST_STATE_gdt_table+GDT_ENTRY_TSS*8+5)(%eax);\
+	/* Switch to host's GDT & IDT. */				\
+	lgdt	LGUEST_STATE_host_gdt(%eax);				\
+	lidt	LGUEST_STATE_host_idt(%eax);				\
+	/* Switch to host's stack. */					\
+	movl	LGUEST_STATE_host_stackptr(%eax), %esp;			\
+	/* Switch to host's TSS */					\
+	movl	$(GDT_ENTRY_TSS*8), %eax;				\
+	ltr	%ax;							\
+	/* Restore host regs */						\
+	popl	%ebx;							\
+	popl	%ecx;							\
+	popl	%edx;							\
+	popl	%esi;							\
+	popl	%edi;							\
+	popl	%ebp;							\
+	popl	%gs;							\
+	popl	%eax;							\
+	popl	%fs;							\
+	popl	%ds;							\
+	popl	%es;							\
+	popl	%ss
+	
+/* Return to run_guest_once. */
+return_to_host:
+	SWITCH_TO_HOST
+	iret
+
+deliver_to_host:
+	SWITCH_TO_HOST
+decode_idt_and_jmp:
+	/* Decode IDT and jump to host's irq handler.  When that does iret, it
+	 * will return to run_guest_once.  This is a feature. */
+	/* We told gcc we'd clobber edx and eax... */
+	movl	LGUEST_STATE_trapnum(%eax), %eax
+	leal	(%edx,%eax,8), %eax
+	movzwl	(%eax),%edx
+	movl	4(%eax), %eax
+	xorw	%ax, %ax
+	orl	%eax, %edx
+	jmp	*%edx
+
+deliver_to_host_with_errcode:
+	SWITCH_TO_HOST
+	pushl	LGUEST_STATE_errcode(%eax)
+	jmp decode_idt_and_jmp
+
+/* Real hardware interrupts are delivered straight to the host.  Others
+   cause us to return to run_guest_once so it can decide what to do.  Note
+   that some of these are overridden by the guest to deliver directly, and
+   never enter here (see load_guest_idt_entry). */
+.macro IRQ_STUB N TARGET
+	.data; .long 1f; .text; 1:
+ /* Make an error number for most traps, which don't have one. */
+ .if (\N <> 2) && (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
+	pushl	$0
+ .endif
+	pushl	$\N
+	jmp	\TARGET
+	ALIGN
+.endm
+
+.macro IRQ_STUBS FIRST LAST TARGET
+ irq=\FIRST
+ .rept \LAST-\FIRST+1
+	IRQ_STUB irq \TARGET
+  irq=irq+1
+ .endr
+.endm
+	
+/* We intercept every interrupt, because we may need to switch back to
+ * host.  Unfortunately we can't tell them apart except by entry
+ * point, so we need 256 entry points.
+ */
+irq_stubs:
+.data
+default_idt_entries:	
+.text
+	IRQ_STUBS 0 1 return_to_host		/* First two traps */
+	IRQ_STUB 2 deliver_to_host_with_errcode	/* NMI */
+	IRQ_STUBS 3 31 return_to_host		/* Rest of traps */
+	IRQ_STUBS 32 127 deliver_to_host	/* Real interrupts */
+	IRQ_STUB 128 return_to_host		/* System call (overridden) */
+	IRQ_STUBS 129 255 deliver_to_host	/* Other real interrupts */
+
+/* Everything after this is used for the lguest_state structs. */
+ALIGN
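
(Working the Makefile numbers through: HYPE_ADDR is 0xFFC01000, so
default_idt_entries occupies 0xFFC01000-0xFFC013FF, switch_to_guest
begins at 0xFFC01400, and the rest of the 64k HYPERVISOR_SIZE region up
to 0xFFC11000 holds the lguest_state structs that __lguest_states() in
core.c hands out.)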
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/interrupts_and_traps.c
@@ -0,0 +1,221 @@
+#include <linux/uaccess.h>
+#include "lg.h"
+
+static void push_guest_stack(struct lguest *lg, u32 __user **gstack, u32 val)
+{
+	lhwrite_u32(lg, (u32)--(*gstack), val);
+}
+
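+/* Deliver a trap or interrupt to the guest: build the frame the CPU
+ * would have pushed ([ss, esp,] eflags, cs, eip[, errcode]) on the
+ * guest's stack, then point the real return frame at its handler. */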
+int reflect_trap(struct lguest *lg, const struct host_trap *trap, int has_err)
+{
+	u32 __user *gstack;
+	u32 eflags, ss, irq_enable;
+	struct lguest_regs *regs = &lg->state->regs;
+
+	if (!trap->addr)
+		return 0;
+
+	/* If they want a ring change, we use new stack and push old ss/esp */
+	if ((regs->ss&0x3) != GUEST_DPL) {
+		gstack = (u32 __user *)guest_pa(lg, lg->state->tss.esp1);
+		ss = lg->state->tss.ss1;
+		push_guest_stack(lg, &gstack, regs->ss);
+		push_guest_stack(lg, &gstack, regs->esp);
+	} else {
+		gstack = (u32 __user *)guest_pa(lg, regs->esp);
+		ss = regs->ss;
+	}
+
+	/* We use IF bit in eflags to indicate whether irqs were disabled
+	   (it's always 0, since irqs are enabled when guest is running). */
+	eflags = regs->eflags;
+	get_user(irq_enable, &lg->lguest_data->irq_enabled);
+	eflags |= (irq_enable & 512);
+
+	push_guest_stack(lg, &gstack, eflags);
+	push_guest_stack(lg, &gstack, regs->cs);
+	push_guest_stack(lg, &gstack, regs->eip);
+
+	if (has_err)
+		push_guest_stack(lg, &gstack, regs->errcode);
+
+	/* Change the real stack so hypervisor returns to trap handler */
+	regs->ss = ss;
+	regs->esp = (u32)gstack + lg->page_offset;
+	regs->cs = (__KERNEL_CS|GUEST_DPL);
+	regs->eip = trap->addr;
+
+	/* GS will be neutered on way back to guest. */
+	put_user(0, &lg->lguest_data->gs_gpf_eip);
+
+	/* Disable interrupts for an interrupt gate. */
+	if (trap->disable_interrupts)
+		put_user(0, &lg->lguest_data->irq_enabled);
+	return 1;
+}
+
+void maybe_do_interrupt(struct lguest *lg)
+{
+	unsigned int irq;
+	DECLARE_BITMAP(irqs, LGUEST_IRQS);
+
+	if (!lg->lguest_data)
+		return;
+
+	/* If timer has changed, set timer interrupt. */
+	if (lg->timer_on && jiffies != lg->last_timer)
+		set_bit(0, lg->irqs_pending);
+
+	/* Mask out any interrupts they have blocked. */
+	copy_from_user(&irqs, lg->lguest_data->interrupts, sizeof(irqs));
+	bitmap_andnot(irqs, lg->irqs_pending, irqs, LGUEST_IRQS);
+
+	irq = find_first_bit(irqs, LGUEST_IRQS);
+	if (irq >= LGUEST_IRQS)
+		return;
+
+	/* If they're halted, we re-enable interrupts. */
+	if (lg->halted) {
+		/* Re-enable interrupts. */
+		put_user(512, &lg->lguest_data->irq_enabled);
+		lg->halted = 0;
+	} else {
+		/* Maybe they have interrupts disabled? */
+		u32 irq_enabled;
+		get_user(irq_enabled, &lg->lguest_data->irq_enabled);
+		if (!irq_enabled)
+			return;
+	}
+
+	if (lg->interrupt[irq].addr != 0) {
+		clear_bit(irq, lg->irqs_pending);
+		reflect_trap(lg, &lg->interrupt[irq], 0);
+	}
+}
+
+void check_bug_kill(struct lguest *lg)
+{
+#ifdef CONFIG_BUG
+	u32 eip = lg->state->regs.eip - PAGE_OFFSET;
+	u16 insn;
+
+	/* This only works for addresses in linear mapping... */
+	if (lg->state->regs.eip < PAGE_OFFSET)
+		return;
+	lhread(lg, &insn, eip, sizeof(insn));
+	if (insn == 0x0b0f) {
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+		u16 l;
+		u32 f;
+		char file[128];
+		lhread(lg, &l, eip+sizeof(insn), sizeof(l));
+		lhread(lg, &f, eip+sizeof(insn)+sizeof(l), sizeof(f));
+		lhread(lg, file, f - PAGE_OFFSET, sizeof(file));
+		file[sizeof(file)-1] = 0;
+		kill_guest(lg, "BUG() at %#x %s:%u", eip, file, l);
+#else
+		kill_guest(lg, "BUG() at %#x", eip);
+#endif	/* CONFIG_DEBUG_BUGVERBOSE */
+	}
+#endif	/* CONFIG_BUG */
+}
+
+static void copy_trap(struct lguest *lg,
+		      struct host_trap *trap,
+		      const struct desc_struct *desc)
+{
+	u8 type = ((desc->b >> 8) & 0xF);
+
+	/* Not present? */
+	if (!(desc->b & 0x8000)) {
+		trap->addr = 0;
+		return;
+	}
+	if (type != 0xE && type != 0xF)
+		kill_guest(lg, "bad IDT type %i", type);
+	trap->disable_interrupts = (type == 0xE);
+	trap->addr = ((desc->a & 0x0000FFFF) | (desc->b & 0xFFFF0000));
+}
+
+/* FIXME: Put this in hypervisor.S and do something clever with relocs? */
+static u8 tramp[] = {
+    0x0f, 0xa8, 0x0f, 0xa9, /* push %gs; pop %gs */
+    0x36, 0xc7, 0x05, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x00,
+    /* movl 0, %ss:lguest_data.gs_gpf_eip */
+    0xe9, 0x55, 0x55, 0x55, 0x55 /* jmp dstaddr */
+};
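+/* The 0x55 runs are placeholders; setup_trampoline() patches them at
+ * the offsets defined below. */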
+#define TRAMP_MOVL_TARGET_OFF 7
+#define TRAMP_JMP_TARGET_OFF 16
+
+static u32 setup_trampoline(struct lguest *lg, unsigned int i, u32 dstaddr)
+{
+	u32 addr, off;
+
+	off = sizeof(tramp)*i;
+	memcpy(lg->trap_page + off, tramp, sizeof(tramp));
+
+	/* 0 is to be placed in lguest_data.gs_gpf_eip. */
+	addr = (u32)&lg->lguest_data->gs_gpf_eip + lg->page_offset;
+	memcpy(lg->trap_page + off + TRAMP_MOVL_TARGET_OFF, &addr, 4);
+
+	/* Address is relative to where end of jmp will be. */
+	addr = dstaddr - ((-4*1024*1024) + off + sizeof(tramp));
+	memcpy(lg->trap_page + off + TRAMP_JMP_TARGET_OFF, &addr, 4);
+	return (-4*1024*1024) + off;
+}
+
+/* We bounce through the trap page for two reasons: first, we need the
+   interrupt destination always mapped, to avoid double faults; second,
+   we want to reload %gs to make it innocuous on entering the kernel.
+ */
+static void setup_idt(struct lguest *lg,
+		      unsigned int i,
+		      const struct desc_struct *desc)
+{
+	u8 type = ((desc->b >> 8) & 0xF);
+	u32 taddr;
+
+	/* Not present? */
+	if (!(desc->b & 0x8000)) {
+		/* FIXME: When we need this, we'll know... */
+		if (lg->state->idt_table[i].a & 0x8000)
+			kill_guest(lg, "removing interrupts not supported");
+		return;
+	}
+
+	/* We could reflect and disable interrupts, but the guest can do that itself. */
+	if (type != 0xF)
+		kill_guest(lg, "bad direct IDT %i type %i", i, type);
+
+	taddr = setup_trampoline(lg, i, (desc->a&0xFFFF)|(desc->b&0xFFFF0000));
+
+	lg->state->idt_table[i].a = (((__KERNEL_CS|GUEST_DPL)<<16)
+					| (taddr & 0x0000FFFF));
+	lg->state->idt_table[i].b = (desc->b&0xEF00)|(taddr&0xFFFF0000);
+}
+
+void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 high)
+{
+	struct desc_struct d = { low, high };
+
+	/* Ignore NMI, doublefault, hypercall, spurious interrupt. */
+	if (i == 2 || i == 8 || i == 15 || i == LGUEST_TRAP_ENTRY)
+		return;
+	/* FIXME: We should handle debug and int3 */
+	else if (i == 1 || i == 3)
+		return;
+	/* We intercept page fault, general protection fault and fpu missing */
+	else if (i == 13)
+		copy_trap(lg, &lg->gpf_trap, &d);
+	else if (i == 14)
+		copy_trap(lg, &lg->page_trap, &d);
+	else if (i == 7)
+		copy_trap(lg, &lg->fpu_trap, &d);
+	/* Other traps go straight to guest. */
+	else if (i < FIRST_EXTERNAL_VECTOR || i == SYSCALL_VECTOR)
+		setup_idt(lg, i, &d);
+	/* A virtual interrupt */
+	else if (i < FIRST_EXTERNAL_VECTOR + LGUEST_IRQS)
+		copy_trap(lg, &lg->interrupt[i-FIRST_EXTERNAL_VECTOR], &d);
+}
+
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/io.c
@@ -0,0 +1,413 @@
+/* Simple I/O model for guests, based on shared memory.
+ * Copyright (C) 2006 Rusty Russell IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ */
+#include <linux/types.h>
+#include <linux/futex.h>
+#include <linux/jhash.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/uaccess.h>
+#include "lg.h"
+
+static struct list_head dma_hash[64];
+
+/* FIXME: allow multi-page lengths. */
+static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma)
+{
+	unsigned int i;
+
+	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+		if (!dma->len[i])
+			return 1;
+		if (!lguest_address_ok(lg, dma->addr[i]))
+			goto kill;
+		if (dma->len[i] > PAGE_SIZE)
+			goto kill;
+		/* We could do over a page, but is it worth it? */
+		if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE)
+			goto kill;
+	}
+	return 1;
+
+kill:
+	kill_guest(lg, "bad DMA entry: %u@%#x", dma->len[i], dma->addr[i]);
+	return 0;
+}
+
+static unsigned int hash(const union futex_key *key)
+{
+	return jhash2((u32*)&key->both.word,
+		      (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+		      key->both.offset)
+		% ARRAY_SIZE(dma_hash);
+}
+
+/* Must hold read lock on dmainfo owner's current->mm->mmap_sem */
+static void unlink_dma(struct lguest_dma_info *dmainfo)
+{
+	BUG_ON(down_trylock(&lguest_lock) == 0);
+	dmainfo->interrupt = 0;
+	list_del(&dmainfo->list);
+	drop_futex_key_refs(&dmainfo->key);
+}
+
+static inline int key_eq(const union futex_key *a, const union futex_key *b)
+{
+	return (a->both.word == b->both.word
+		&& a->both.ptr == b->both.ptr
+		&& a->both.offset == b->both.offset);
+}
+
+static u32 unbind_dma(struct lguest *lg,
+		      const union futex_key *key,
+		      unsigned long dmas)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < LGUEST_MAX_DMA; i++) {
+		if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) {
+			unlink_dma(&lg->dma[i]);
+			ret = 1;
+			break;
+		}
+	}
+	return ret;
+}
+
+u32 bind_dma(struct lguest *lg,
+	     unsigned long addr, unsigned long dmas, u16 numdmas, u8 interrupt)
+{
+	unsigned int i;
+	u32 ret = 0;
+	union futex_key key;
+
+	if (interrupt >= LGUEST_IRQS)
+		return 0;
+
+	down(&lguest_lock);
+	down_read(&current->mm->mmap_sem);
+	if (get_futex_key((u32 __user *)addr, &key) != 0) {
+		kill_guest(lg, "bad dma address %#lx", addr);
+		goto unlock;
+	}
+	get_futex_key_refs(&key);
+
+	if (interrupt == 0)
+		ret = unbind_dma(lg, &key, dmas);
+	else {
+		for (i = 0; i < LGUEST_MAX_DMA; i++) {
+			if (lg->dma[i].interrupt == 0) {
+				lg->dma[i].dmas = dmas;
+				lg->dma[i].num_dmas = numdmas;
+				lg->dma[i].next_dma = 0;
+				lg->dma[i].key = key;
+				lg->dma[i].guestid = lg->guestid;
+				lg->dma[i].interrupt = interrupt;
+				list_add(&lg->dma[i].list,
+					 &dma_hash[hash(&key)]);
+				ret = 1;
+				goto unlock;
+			}
+		}
+	}
+	drop_futex_key_refs(&key);
+unlock:
+ 	up_read(&current->mm->mmap_sem);
+	up(&lguest_lock);
+	return ret;
+}
+
+/* lhread from another guest */
+static int lhread_other(struct lguest *lg,
+			void *buf, u32 addr, unsigned bytes)
+{
+	if (addr + bytes < addr
+	    || !lguest_address_ok(lg, addr+bytes)
+	    || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) {
+		memset(buf, 0, bytes);
+		kill_guest(lg, "bad address in registered DMA struct");
+		return 0;
+	}
+	return 1;
+}
+
+/* lhwrite to another guest */
+static int lhwrite_other(struct lguest *lg, u32 addr,
+			 const void *buf, unsigned bytes)
+{
+	if (addr + bytes < addr
+	    || !lguest_address_ok(lg, addr+bytes)
+	    || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1)
+		!= bytes)) {
+		kill_guest(lg, "bad address writing to registered DMA");
+		return 0;
+	}
+	return 1;
+}
+
+static u32 copy_data(const struct lguest_dma *src,
+		     const struct lguest_dma *dst,
+		     struct page *pages[])
+{
+	unsigned int totlen, si, di, srcoff, dstoff;
+	void *maddr = NULL;
+
+	totlen = 0;
+	si = di = 0;
+	srcoff = dstoff = 0;
+	while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si]
+	       && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) {
+		u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff);
+
+		if (!maddr)
+			maddr = kmap(pages[di]);
+
+		/* FIXME: This is not completely portable, since
+		   archs do different things for copy_to_user_page. */
+		if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE,
+				   (void __user *)src->addr[si], len) != 0) {
+			totlen = 0;
+			break;
+		}
+
+		totlen += len;
+		srcoff += len;
+		dstoff += len;
+		if (srcoff == src->len[si]) {
+			si++;
+			srcoff = 0;
+		}
+		if (dstoff == dst->len[di]) {
+			kunmap(pages[di]);
+			maddr = NULL;
+			di++;
+			dstoff = 0;
+		}
+	}
+
+	if (maddr)
+		kunmap(pages[di]);
+
+	return totlen;
+}
+
+/* Src is us, ie. current. */
+static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src,
+		  struct lguest *dstlg, const struct lguest_dma *dst)
+{
+	int i;
+	u32 ret;
+	struct page *pages[LGUEST_MAX_DMA_SECTIONS];
+
+	if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src))
+		return 0;
+
+	/* First get the destination pages */
+	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+		if (dst->len[i] == 0)
+			break;
+		if (get_user_pages(dstlg->tsk, dstlg->mm,
+				   dst->addr[i], 1, 1, 1, pages+i, NULL)
+		    != 1) {
+			ret = 0;
+			goto drop_pages;
+		}
+	}
+
+	/* Now copy until we run out of src or dst. */
+	ret = copy_data(src, dst, pages);
+
+drop_pages:
+	while (--i >= 0)
+		put_page(pages[i]);
+	return ret;
+}
+
+/* We cache one process to wake up: helps batching & wakes outside locks. */
+void set_wakeup_process(struct lguest *lg, struct task_struct *p)
+{
+	if (p == lg->wake)
+		return;
+
+	if (lg->wake) {
+		wake_up_process(lg->wake);
+		put_task_struct(lg->wake);
+	}
+	lg->wake = p;
+	if (lg->wake)
+		get_task_struct(lg->wake);
+}
+
+static int dma_transfer(struct lguest *srclg,
+			unsigned long udma,
+			struct lguest_dma_info *dst)
+{
+	struct lguest_dma dst_dma, src_dma;
+	struct lguest *dstlg;
+	u32 i, dma = 0;
+
+	dstlg = &lguests[dst->guestid];
+	/* Get our dma list. */
+	lhread(srclg, &src_dma, udma, sizeof(src_dma));
+
+	/* We can't deadlock against them dmaing to us, because this
+	 * is all under the lguest_lock. */
+	down_read(&dstlg->mm->mmap_sem);
+
+	for (i = 0; i < dst->num_dmas; i++) {
+		dma = (dst->next_dma + i) % dst->num_dmas;
+		if (!lhread_other(dstlg, &dst_dma,
+				  dst->dmas + dma * sizeof(struct lguest_dma),
+				  sizeof(dst_dma))) {
+			goto fail;
+		}
+		if (!dst_dma.used_len)
+			break;
+	}
+	if (i != dst->num_dmas) {
+		unsigned long used_lenp;
+		unsigned int ret;
+
+		ret = do_dma(srclg, &src_dma, dstlg, &dst_dma);
+		/* Put used length in src. */
+		lhwrite_u32(srclg,
+			    udma+offsetof(struct lguest_dma, used_len), ret);
+		if (ret == 0 && src_dma.len[0] != 0)
+			goto fail;
+
+		/* Make sure destination sees contents before length. */
+		mb();
+		used_lenp = dst->dmas
+			+ dma * sizeof(struct lguest_dma)
+			+ offsetof(struct lguest_dma, used_len);
+		lhwrite_other(dstlg, used_lenp, &ret, sizeof(ret));
+		dst->next_dma++;
+	}
+ 	up_read(&dstlg->mm->mmap_sem);
+
+	/* Do this last so dst doesn't simply sleep on lock. */
+	set_bit(dst->interrupt, dstlg->irqs_pending);
+	set_wakeup_process(srclg, dstlg->tsk);
+	return i == dst->num_dmas;
+
+fail:
+	up_read(&dstlg->mm->mmap_sem);
+	return 0;
+}
+
+int send_dma(struct lguest *lg, unsigned long addr, unsigned long udma)
+{
+	union futex_key key;
+	int pending = 0, empty = 0;
+
+again:
+	down(&lguest_lock);
+	down_read(&current->mm->mmap_sem);
+	if (get_futex_key((u32 __user *)addr, &key) != 0) {
+		kill_guest(lg, "bad sending DMA address");
+		goto unlock;
+	}
+	/* Shared mapping?  Look for other guests... */
+	if (key.shared.offset & 1) {
+		struct lguest_dma_info *i, *n;
+		list_for_each_entry_safe(i, n, &dma_hash[hash(&key)], list) {
+			if (i->guestid == lg->guestid)
+				continue;
+			if (!key_eq(&key, &i->key))
+				continue;
+
+			empty += dma_transfer(lg, udma, i);
+			break;
+		}
+		if (empty == 1) {
+			/* Give any recipients one chance to restock. */
+			up_read(&current->mm->mmap_sem);
+			up(&lguest_lock);
+			yield();
+			empty++;
+			goto again;
+		}
+		pending = 0;
+	} else {
+		/* Private mapping: tell our userspace. */
+		lg->dma_is_pending = 1;
+		lg->pending_dma = udma;
+		lg->pending_addr = addr;
+		pending = 1;
+	}
+unlock:
+	up_read(&current->mm->mmap_sem);
+	up(&lguest_lock);
+	return pending;
+}
+
+void release_all_dma(struct lguest *lg)
+{
+	unsigned int i;
+
+	BUG_ON(down_trylock(&lguest_lock) == 0);
+
+	down_read(&lg->mm->mmap_sem);
+	for (i = 0; i < LGUEST_MAX_DMA; i++) {
+		if (lg->dma[i].interrupt)
+			unlink_dma(&lg->dma[i]);
+	}
+	up_read(&lg->mm->mmap_sem);
+}
+
+/* Userspace wants a dma buffer from this guest. */
+unsigned long get_dma_buffer(struct lguest *lg,
+			     unsigned long addr, unsigned long *interrupt)
+{
+	unsigned long ret = 0;
+	union futex_key key;
+	struct lguest_dma_info *i;
+
+	down(&lguest_lock);
+	down_read(&current->mm->mmap_sem);
+	if (get_futex_key((u32 __user *)addr, &key) != 0) {
+		kill_guest(lg, "bad registered DMA buffer");
+		goto unlock;
+	}
+	list_for_each_entry(i, &dma_hash[hash(&key)], list) {
+		if (key_eq(&key, &i->key) && i->guestid == lg->guestid) {
+			unsigned int j;
+			for (j = 0; j < i->num_dmas; j++) {
+				struct lguest_dma dma;
+
+				ret = i->dmas + j * sizeof(struct lguest_dma);
+				lhread(lg, &dma, ret, sizeof(dma));
+				if (dma.used_len == 0)
+					break;
+			}
+			*interrupt = i->interrupt;
+			break;
+		}
+	}
+unlock:
+	up_read(&current->mm->mmap_sem);
+	up(&lguest_lock);
+	return ret;
+}
+
+void lguest_io_init(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(dma_hash); i++)
+		INIT_LIST_HEAD(&dma_hash[i]);
+}
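
(To make the bind/send pairing concrete, a hedged sketch with
hypothetical guest-side names, matching how do_hcall() unpacks the
arguments: a receiver registers an array of struct lguest_dma against a
futex key on a shared page, and a sender aims at the same key:

	/* Receiver: number of dmas in bits 8+, interrupt in low byte. */
	hcall(LHCALL_BIND_DMA, key_addr, __pa(dma_array),
	      (NUM_DMAS << 8) | DMA_IRQ);

	/* Sender: one struct lguest_dma describing the outgoing buffers. */
	hcall(LHCALL_SEND_DMA, key_addr, __pa(&out_dma), 0);

send_dma() matches the futex key, copies into the receiver's first free
dma slot, and raises DMA_IRQ in the receiver.)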
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lg.h
@@ -0,0 +1,274 @@
+#ifndef _LGUEST_H
+#define _LGUEST_H
+
+#include <asm/desc.h>
+/* 64k ought to be enough for anybody! */
+#define HYPERVISOR_SIZE 65536
+#define HYPERVISOR_PAGES (HYPERVISOR_SIZE/PAGE_SIZE)
+
+#define GDT_ENTRY_LGUEST_CS	10
+#define GDT_ENTRY_LGUEST_DS	11
+#define LGUEST_CS		(GDT_ENTRY_LGUEST_CS * 8)
+#define LGUEST_DS		(GDT_ENTRY_LGUEST_DS * 8)
+
+#if 0
+/* FIXME: Use asm-offsets here... */
+#define LGUEST_TSS_OFF		0
+#define LGUEST_TSS_SIZE		(26*4)
+#define LGUEST_GDT_OFF		(LGUEST_TSS_OFF + LGUEST_TSS_SIZE)
+#define LGUEST_GDTABLE_OFF	(LGUEST_GDT_OFF + 8)
+#define LGUEST_GDTABLE_SIZE	(8 * GDT_ENTRIES)
+#define LGUEST_IDT_OFF		(LGUEST_GDTABLE_OFF + LGUEST_GDTABLE_SIZE)
+#define LGUEST_IDTABLE_SIZE	(8 * IDT_ENTRIES)
+#define LGUEST_IDTABLE_OFF	(LGUEST_IDT_OFF + 8)
+#define LGUEST_HOST_OFF		(LGUEST_IDTABLE_OFF + LGUEST_IDTABLE_SIZE)
+#define LGUEST_HOST_GDT_OFF	LGUEST_HOST_OFF
+#define LGUEST_HOST_IDT_OFF	(LGUEST_HOST_OFF + 8)
+#define LGUEST_HOST_PGDIR_OFF	(LGUEST_HOST_IDT_OFF + 8)
+#define LGUEST_HOST_STKP_OFF	(LGUEST_HOST_PGDIR_OFF + 4)
+#define LGUEST_HOST_SIZE	(8+8+4+4)
+#define LGUEST_REGS_OFF		(LGUEST_HOST_OFF + LGUEST_HOST_SIZE)	
+#define LGUEST_TRAPNUM_OFF	(LGUEST_REGS_OFF + 12*4)
+#define LGUEST_ERRCODE_OFF	(LGUEST_REGS_OFF + 13*4)
+#endif
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/stringify.h>
+#include <linux/binfmts.h>
+#include <linux/futex.h>
+#include <asm/lguest.h>
+#include <asm/lguest_user.h>
+#include <asm/semaphore.h>
+#include "irq_vectors.h"
+
+#define GUEST_DPL 1
+
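+/* This layout must match hypervisor.S: the CPU's trap frame is pushed
+ * first, then the stubs push errcode/trapnum, then SAVE_REGS pushes
+ * the segments and GPRs, so fields appear here in reverse push order. */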
+struct lguest_regs
+{
+	/* Manually saved part. */
+	u32 cr3;
+	u32 ebx, ecx, edx;
+	u32 esi, edi, ebp;
+	u32 gs;
+	u32 eax;
+	u32 fs, ds, es;
+	u32 trapnum, errcode;
+	/* Trap pushed part */
+	u32 eip;
+	u32 cs;
+	u32 eflags;
+	u32 esp;
+	u32 ss;
+};
+
+__exit void free_pagetables(void);
+__init int init_pagetables(struct page *hype_pages);
+
+/* Full 4G segment descriptors, suitable for CS and DS. */
+#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) 
+#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) 
+
+/* Simplified version of IDT. */
+struct host_trap
+{
+	unsigned long addr;
+	int disable_interrupts;
+};
+
+struct lguest_dma_info
+{
+	struct list_head list;
+	union futex_key key;
+	unsigned long dmas;
+	u16 next_dma;
+	u16 num_dmas;
+	u16 guestid;
+	u8 interrupt; 	/* 0 when not registered */
+};
+
+struct pgdir
+{
+	u32 cr3;
+	u32 *pgdir;
+};
+
+/* The private info the thread maintains about the guest. */
+struct lguest
+{
+	struct lguest_state *state;
+	struct lguest_data __user *lguest_data;
+	struct task_struct *tsk;
+	struct mm_struct *mm; 	/* == tsk->mm, but that becomes NULL on exit */
+	u16 guestid;
+	u32 pfn_limit;
+	u32 page_offset;
+	u32 cr2;
+	int timer_on;
+	int halted;
+	int ts;
+	u32 gpf_eip;
+	u32 last_timer;
+	u32 next_hcall;
+	u16 tls_limits[GDT_ENTRY_TLS_ENTRIES];
+
+	/* We keep a small number of these. */
+	u32 pgdidx;
+	struct pgdir pgdirs[4];
+	void *trap_page;
+
+	/* Cached wakeup: we hold a reference to this task. */
+	struct task_struct *wake;
+
+	unsigned long noirq_start, noirq_end;
+	int dma_is_pending;
+	unsigned long pending_dma; /* struct lguest_dma */
+	unsigned long pending_addr; /* address they're sending to */
+
+	unsigned int stack_pages;
+
+	struct lguest_dma_info dma[LGUEST_MAX_DMA];
+
+	/* Dead? */
+	const char *dead;
+
+	/* We intercept page fault (demand shadow paging & cr2 saving),
+	   general protection fault (in/out emulation, TLS handling) and
+	   device not available (TS handling). */
+	struct host_trap page_trap, gpf_trap, fpu_trap;
+
+	/* Virtual interrupts */
+	DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
+	struct host_trap interrupt[LGUEST_IRQS];
+};
+
+extern struct page *hype_pages; /* Contiguous pages. */
+extern struct lguest lguests[];
+extern struct semaphore lguest_lock;
+
+/* core.c: */
+/* Entry points in hypervisor */
+const unsigned long *__lguest_default_idt_entries(void);
+struct lguest_state *__lguest_states(void);
+u32 lhread_u32(struct lguest *lg, u32 addr);
+void lhwrite_u32(struct lguest *lg, u32 addr, u32 val);
+void lhread(struct lguest *lg, void *buf, u32 addr, unsigned bytes);
+void lhwrite(struct lguest *lg, u32 addr, const void *buf, unsigned bytes);
+int lguest_address_ok(const struct lguest *lg, unsigned long addr);
+int run_guest(struct lguest *lg, char __user *user);
+int find_free_guest(void);
+
+/* interrupts_and_traps.c: */
+void maybe_do_interrupt(struct lguest *lg);
+int reflect_trap(struct lguest *lg, const struct host_trap *trap, int has_err);
+void check_bug_kill(struct lguest *lg);
+void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi);
+
+/* segments.c: */
+void load_guest_gdt(struct lguest *lg, u32 table, u32 num);
+void guest_load_tls(struct lguest *lg,
+		    const struct desc_struct __user *tls_array);
+
+int init_guest_pagetable(struct lguest *lg, u32 pgtable);
+void free_guest_pagetable(struct lguest *lg);
+void guest_new_pagetable(struct lguest *lg, u32 pgtable);
+void guest_set_pud(struct lguest *lg, unsigned long cr3, u32 i);
+void guest_pagetable_clear_all(struct lguest *lg);
+void guest_pagetable_flush_user(struct lguest *lg);
+void guest_set_pte(struct lguest *lg, unsigned long cr3,
+		   unsigned long vaddr, u32 val);
+void map_trap_page(struct lguest *info);
+int demand_page(struct lguest *info, u32 cr2, int write);
+void pin_stack_pages(struct lguest *lg);
+
+int lguest_device_init(void);
+void lguest_device_remove(void);
+void lguest_io_init(void);
+u32 bind_dma(struct lguest *lg,
+	     unsigned long addr, unsigned long udma, u16 numdmas, u8 interrupt);
+int send_dma(struct lguest *info, unsigned long addr,
+	     unsigned long udma);
+void release_all_dma(struct lguest *lg);
+unsigned long get_dma_buffer(struct lguest *lg, unsigned long addr,
+			     unsigned long *interrupt);
+
+void set_wakeup_process(struct lguest *lg, struct task_struct *p);
+int do_async_hcalls(struct lguest *info);
+int hypercall(struct lguest *info, struct lguest_regs *regs);
+
+#define kill_guest(lg, fmt...)					\
+do {								\
+	if (!(lg)->dead) {					\
+		(lg)->dead = kasprintf(GFP_ATOMIC, fmt);	\
+		if (!(lg)->dead)				\
+			(lg)->dead = (void *)1;			\
+	}							\
+} while(0)
+
+static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
+{
+	return vaddr - lg->page_offset;
+}
+
+/* Hardware-defined TSS structure. */
+struct x86_tss
+{
+	unsigned short	back_link,__blh;
+	unsigned long	esp0;
+	unsigned short	ss0,__ss0pad;
+	unsigned long	esp1;
+	unsigned short	ss1,__ss1pad;
+	unsigned long	esp2;
+	unsigned short	ss2,__ss2pad;
+	unsigned long	cr3;
+	unsigned long	eip;
+	unsigned long	eflags;
+	unsigned long	eax,ecx,edx,ebx;
+	unsigned long	esp; /* We actually use this one to save esp. */
+	unsigned long	ebp;
+	unsigned long	esi;
+	unsigned long	edi;
+	unsigned short	es, __espad;
+	unsigned short	cs, __cspad;
+	unsigned short	ss, __sspad;
+	unsigned short	ds, __dspad;
+	unsigned short	fs, __fspad;
+	unsigned short	gs, __gspad;
+	unsigned short	ldt, __ldtpad;
+	unsigned short	trace, io_bitmap_base;
+};
+
+int fixup_gdt_table(struct desc_struct *gdt, unsigned int num,
+		    struct lguest_regs *regs, struct x86_tss *tss);
+
+struct lguest_host_state
+{
+	struct Xgt_desc_struct	gdt;
+	struct Xgt_desc_struct	idt;
+	unsigned long		pgdir;
+	unsigned long		stackptr;
+};
+
+/* This sits in the high-mapped shim. */
+struct lguest_state
+{
+	/* Task struct. */
+	struct x86_tss tss;
+
+	/* Gate descriptor table. */
+	struct Xgt_desc_struct gdt;
+	struct desc_struct gdt_table[GDT_ENTRIES];
+
+	/* Interrupt descriptor table. */
+	struct Xgt_desc_struct idt;
+	struct desc_struct idt_table[IDT_ENTRIES];
+
+	/* Host state we store while the guest runs. */
+	struct lguest_host_state host;
+
+	/* This is the stack on which we push our regs. */
+	struct lguest_regs regs;
+};
+#endif	/* __ASSEMBLY__ */
+#endif	/* _LGUEST_H */
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lguest.c
@@ -0,0 +1,595 @@
+/*
+ * Lguest specific paravirt-ops implementation
+ *
+ * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/start_kernel.h>
+#include <linux/string.h>
+#include <linux/console.h>
+#include <linux/screen_info.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <asm/paravirt.h>
+#include <asm/lguest.h>
+#include <asm/lguest_user.h>
+#include <asm/param.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/e820.h>
+#include <asm/pda.h>
+#include <asm/asm-offsets.h>
+
+extern int mce_disabled;
+
+struct lguest_data lguest_data;
+struct lguest_device_desc *lguest_devices;
+static __initdata const struct lguest_boot_info *boot = __va(0);
+
+void async_hcall(unsigned long call,
+		 unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	/* Note: This code assumes we're uniprocessor. */
+	static unsigned int next_call;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (lguest_data.hcall_status[next_call] != 0xFF) {
+		/* Table full, so do normal hcall which will flush table. */
+		hcall(call, arg1, arg2, arg3);
+	} else {
+		lguest_data.hcalls[next_call].eax = call;
+		lguest_data.hcalls[next_call].edx = arg1;
+		lguest_data.hcalls[next_call].ebx = arg2;
+		lguest_data.hcalls[next_call].ecx = arg3;
+		wmb();
+		lguest_data.hcall_status[next_call] = 0;
+		if (++next_call == LHCALL_RING_SIZE)
+			next_call = 0;
+	}
+	local_irq_restore(flags);
+}
+
+#ifdef PARAVIRT_LAZY_NONE 	/* Not in 2.6.20. */
+static int lazy_mode;
+static void fastcall lguest_lazy_mode(int mode)
+{
+	lazy_mode = mode;
+	if (mode == PARAVIRT_LAZY_NONE)
+		hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+}
+
+static void lazy_hcall(unsigned long call,
+		       unsigned long arg1,
+		       unsigned long arg2,
+		       unsigned long arg3)
+{
+	if (lazy_mode == PARAVIRT_LAZY_NONE)
+		hcall(call, arg1, arg2, arg3);
+	else
+		async_hcall(call, arg1, arg2, arg3);
+}
+#else
+#define lazy_hcall hcall
+#endif
+
+static unsigned long fastcall save_fl(void)
+{
+	return lguest_data.irq_enabled;
+}
+
+static void fastcall restore_fl(unsigned long flags)
+{
+	/* FIXME: Check if interrupt pending... */
+	lguest_data.irq_enabled = flags;
+}
+
+static void fastcall irq_disable(void)
+{
+	lguest_data.irq_enabled = 0;
+}
+
+static void fastcall irq_enable(void)
+{
+	/* Linux i386 code expects bit 9 set. */
+	/* FIXME: Check if interrupt pending... */
+	lguest_data.irq_enabled = 512;
+}
+
+static void fastcall lguest_load_gdt(const struct Xgt_desc_struct *desc)
+{
+	BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
+	hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
+}
+
+static void fastcall lguest_load_idt(const struct Xgt_desc_struct *desc)
+{
+	unsigned int i;
+	struct desc_struct *idt = (void *)desc->address;
+
+	for (i = 0; i < (desc->size+1)/8; i++)
+		hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
+}
+
+static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
+{
+	hcall(LHCALL_CRASH, __pa(p), 0, 0);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block paniced = {
+	.notifier_call = lguest_panic
+};
+
+static cycle_t lguest_clock_read(void)
+{
+	/* FIXME: This is just the native one.  Account stolen time! */
+	return paravirt_ops.read_tsc();
+}
+
+/* FIXME: Update iff tsc rate changes. */
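+/* The host stored clocksource_khz2mult(tsc_khz, 22) in clock_mult, so
+ * guest time is ns = (tsc_cycles * mult) >> 22; a 2GHz TSC, for
+ * example, gives mult = 2^21, i.e. half a nanosecond per cycle. */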
+static struct clocksource lguest_clock = {
+	.name			= "lguest",
+	.rating			= 400,
+	.read			= lguest_clock_read,
+	.mask			= CLOCKSOURCE_MASK(64),
+	.mult			= 0, /* to be set */
+	.shift			= 22,
+	.is_continuous		= 1,
+};
+
+static char *lguest_memory_setup(void)
+{
+	/* We do these here because lockcheck barfs if done before start_kernel */
+	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
+	lguest_clock.mult = lguest_data.clock_mult;
+	clocksource_register(&lguest_clock);
+
+	e820.nr_map = 0;
+	add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM);
+	return "LGUEST";
+}
+
+static fastcall void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
+				 unsigned int *ecx, unsigned int *edx)
+{
+	int is_feature = (*eax == 1);
+
+	asm volatile ("cpuid"
+		      : "=a" (*eax),
+			"=b" (*ebx),
+			"=c" (*ecx),
+			"=d" (*edx)
+		      : "0" (*eax), "2" (*ecx));
+
+	if (is_feature) {
+		unsigned long *excap = (unsigned long *)ecx,
+			*features = (unsigned long *)edx;
+		/* Hypervisor needs to know when we flush kernel pages. */
+		set_bit(X86_FEATURE_PGE, features);
+		/* We don't have any features! */
+		clear_bit(X86_FEATURE_VME, features);
+		clear_bit(X86_FEATURE_DE, features);
+		clear_bit(X86_FEATURE_PSE, features);
+		clear_bit(X86_FEATURE_PAE, features);
+		clear_bit(X86_FEATURE_SEP, features);
+		clear_bit(X86_FEATURE_APIC, features);
+		clear_bit(X86_FEATURE_MTRR, features);
+		/* No MWAIT, either */
+		clear_bit(3, excap);
+	}
+}
+
+static unsigned long current_cr3;
+static void fastcall lguest_write_cr3(unsigned long cr3)
+{
+	hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
+	current_cr3 = cr3;
+}
+
+static void fastcall lguest_flush_tlb(void)
+{
+	lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
+}
+
+static void fastcall lguest_flush_tlb_kernel(void)
+{
+	lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+}
+
+static void fastcall lguest_flush_tlb_single(u32 addr)
+{
+	/* Simply set it to zero, and it will fault back in. */
+	lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);
+}
+
+/* FIXME: Eliminate all callers of this. */
+static fastcall void lguest_set_pte(pte_t *ptep, pte_t pteval)
+{
+	*ptep = pteval;
+	/* Don't bother with hypercall before initial setup. */
+	if (current_cr3)
+		hcall(LHCALL_SET_UNKNOWN_PTE, 0, 0, 0);
+}
+
+static fastcall void lguest_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+{
+	*ptep = pteval;
+	lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
+}
+
+/* We only support two-level pagetables at the moment. */
+static fastcall void lguest_set_pud(pmd_t *pmdp, pmd_t pmdval)
+{
+	*pmdp = pmdval;
+	lazy_hcall(LHCALL_SET_PUD, __pa(pmdp)&PAGE_MASK,
+		   (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static fastcall void lguest_apic_write(unsigned long reg, unsigned long v)
+{
+}
+
+static fastcall void lguest_apic_write_atomic(unsigned long reg, unsigned long v)
+{
+}
+
+static fastcall unsigned long lguest_apic_read(unsigned long reg)
+{
+	return 0;
+}
+#endif
+
+/* We move the eflags word to lguest_data.irq_enabled to restore interrupt
+   state.  For page faults, GPFs and virtual interrupts, the hypervisor
+   has saved eflags manually; otherwise the trap was delivered directly,
+   so eflags reflects the real machine IF state, i.e. interrupts on.
+   Since the kernel always dies if it takes such a trap with interrupts
+   disabled anyway, turning interrupts back on unconditionally here is
+   OK. */
+asm("lguest_iret:"
+    " pushl	%eax;"
+    " movl	12(%esp), %eax;"
+    "lguest_noirq_start:;"
+    " movl	%eax,%ss:lguest_data+"__stringify(LGUEST_DATA_irq_enabled)";"
+    " popl	%eax;"
+    " iret;"
+    "lguest_noirq_end:");
+extern void fastcall lguest_iret(void);
+extern char lguest_noirq_start[], lguest_noirq_end[];
+
+static void fastcall lguest_load_esp0(struct tss_struct *tss,
+				     struct thread_struct *thread)
+{
+	lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0,
+		   THREAD_SIZE/PAGE_SIZE);
+}
+
+static fastcall void lguest_load_tr_desc(void)
+{
+}
+
+static fastcall void lguest_set_ldt(const void *addr, unsigned entries)
+{
+	/* FIXME: Implement. */
+	BUG_ON(entries);
+}
+
+static fastcall void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+	lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
+}
+
+static fastcall void lguest_set_debugreg(int regno, unsigned long value)
+{
+	/* FIXME: Implement */
+}
+
+static unsigned int lguest_cr0;
+static fastcall void lguest_clts(void)
+{
+	lazy_hcall(LHCALL_TS, 0, 0, 0);
+	lguest_cr0 &= ~8U;
+}
+
+static fastcall unsigned long lguest_read_cr0(void)
+{
+	return lguest_cr0;
+}
+
+static fastcall void lguest_write_cr0(unsigned long val)
+{
+	hcall(LHCALL_TS, val & 8, 0, 0);
+	lguest_cr0 = val;
+}
+
+static fastcall unsigned long lguest_read_cr2(void)
+{
+	return lguest_data.cr2;
+}
+
+static fastcall unsigned long lguest_read_cr3(void)
+{
+	return current_cr3;
+}
+
+/* Used to enable/disable PGE, but we don't care. */
+static fastcall unsigned long lguest_read_cr4(void)
+{
+	return 0;
+}
+
+static fastcall void lguest_write_cr4(unsigned long val)
+{
+}
+
+/* FIXME: These should be in a header somewhere */
+extern unsigned long init_pg_tables_end;
+
+static void fastcall lguest_time_irq(unsigned int irq, struct irq_desc *desc)
+{
+	do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0));
+	update_process_times(user_mode_vm(get_irq_regs()));
+}
+
+static void disable_lguest_irq(unsigned int irq)
+{
+	set_bit(irq, lguest_data.interrupts);
+}
+
+static void enable_lguest_irq(unsigned int irq)
+{
+	clear_bit(irq, lguest_data.interrupts);
+	/* FIXME: If it's pending? */
+}
+
+static struct irq_chip lguest_irq_controller = {
+	.name		= "lguest",
+	.mask		= disable_lguest_irq,
+	.mask_ack	= disable_lguest_irq,
+	.unmask		= enable_lguest_irq,
+};
+
+static void lguest_time_init(void)
+{
+	set_irq_handler(0, lguest_time_irq);
+	hcall(LHCALL_TIMER_START, HZ, 0, 0);
+}
+
+static void __init lguest_init_IRQ(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < LGUEST_IRQS; i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (i >= NR_IRQS)
+			break;
+		if (vector != SYSCALL_VECTOR) {
+			set_intr_gate(vector, interrupt[i]);
+			set_irq_chip_and_handler(i, &lguest_irq_controller,
+						 handle_level_irq);
+		}
+	}
+	irq_ctx_init(smp_processor_id());
+}
+
+static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
+{
+	u32 *lp = (u32 *)((char *)dt + entry*8);
+	lp[0] = entry_low;
+	lp[1] = entry_high;
+}
+
+static fastcall void lguest_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+	/* FIXME: Allow this. */
+	BUG();
+}
+
+static fastcall void lguest_write_gdt_entry(void *dt, int entrynum,
+					   u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+	hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
+}
+
+static fastcall void lguest_write_idt_entry(void *dt, int entrynum,
+					   u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+	hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
+}
+
+#define LGUEST_IRQ "lguest_data+"__stringify(LGUEST_DATA_irq_enabled)
+#define DEF_LGUEST(name, code)				\
+	extern const char start_##name[], end_##name[];		\
+	asm("start_" #name ": " code "; end_" #name ":")
+DEF_LGUEST(cli, "movl $0," LGUEST_IRQ);
+DEF_LGUEST(sti, "movl $512," LGUEST_IRQ);
+DEF_LGUEST(popf, "movl %eax," LGUEST_IRQ);
+DEF_LGUEST(pushf, "movl " LGUEST_IRQ ",%eax");
+DEF_LGUEST(pushf_cli, "movl " LGUEST_IRQ ",%eax; movl $0," LGUEST_IRQ);
+DEF_LGUEST(iret, ".byte 0xE9,0,0,0,0"); /* jmp ... */
+
+static const struct lguest_insns
+{
+	const char *start, *end;
+} lguest_insns[] = {
+	[PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
+	[PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
+	[PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
+	[PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
+	[PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
+	[PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
+};
+static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
+{
+	unsigned int insn_len;
+
+	/* Don't touch it if we don't have a replacement */
+	if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
+		return len;
+
+	insn_len = lguest_insns[type].end - lguest_insns[type].start;
+
+	/* Similarly if we can't fit replacement. */
+	if (len < insn_len)
+		return len;
+
+	memcpy(insns, lguest_insns[type].start, insn_len);
+	if (type == PARAVIRT_INTERRUPT_RETURN) {
+		/* Jumps are relative. */
+		u32 off = (u32)lguest_iret - ((u32)insns + insn_len);
+		memcpy(insns+1, &off, sizeof(off));
+	}
+	return insn_len;
+}
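
The only fiddly case above is the iret: the 0xE9 byte emitted by DEF_LGUEST(iret, ...) is a jmp rel32, and a rel32 displacement is relative to the address of the *next* instruction, hence target - (insns + insn_len). A stand-alone check of that encoding (the addresses are made up):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
        uint8_t insn[5] = { 0xE9, 0, 0, 0, 0 };  /* jmp rel32 */
        uint32_t site = 0xc0100000;              /* patch site */
        uint32_t target = 0xc0102345;            /* say, lguest_iret */
        uint32_t off = target - (site + sizeof(insn));

        memcpy(insn + 1, &off, sizeof(off));     /* little-endian rel32 */
        printf("E9 %02x %02x %02x %02x (off = %#x)\n",
               insn[1], insn[2], insn[3], insn[4], off);
        return 0;
    }
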
+
+static void fastcall lguest_safe_halt(void)
+{
+	hcall(LHCALL_HALT, 0, 0, 0);
+}
+
+static unsigned long lguest_get_wallclock(void)
+{
+	return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);
+}
+
+static void lguest_power_off(void)
+{
+	hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
+}
+
+static __attribute_used__ __init void lguest_init(void)
+{
+	extern struct Xgt_desc_struct cpu_gdt_descr;
+	extern struct i386_pda boot_pda;
+
+	paravirt_ops.name = "lguest";
+	paravirt_ops.paravirt_enabled = 1;
+	paravirt_ops.kernel_rpl = 1;
+
+	paravirt_ops.save_fl = save_fl;
+	paravirt_ops.restore_fl = restore_fl;
+	paravirt_ops.irq_disable = irq_disable;
+	paravirt_ops.irq_enable = irq_enable;
+	paravirt_ops.load_gdt = lguest_load_gdt;
+	paravirt_ops.memory_setup = lguest_memory_setup;
+	paravirt_ops.cpuid = lguest_cpuid;
+	paravirt_ops.write_cr3 = lguest_write_cr3;
+	paravirt_ops.flush_tlb_user = lguest_flush_tlb;
+	paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;
+	paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
+	paravirt_ops.set_pte = lguest_set_pte;
+	paravirt_ops.set_pte_at = lguest_set_pte_at;
+	paravirt_ops.set_pmd = lguest_set_pud;
+#ifdef CONFIG_X86_LOCAL_APIC
+	paravirt_ops.apic_write = lguest_apic_write;
+	paravirt_ops.apic_write_atomic = lguest_apic_write_atomic;
+	paravirt_ops.apic_read = lguest_apic_read;
+#endif
+	paravirt_ops.load_idt = lguest_load_idt;
+	paravirt_ops.iret = lguest_iret;
+	paravirt_ops.load_esp0 = lguest_load_esp0;
+	paravirt_ops.load_tr_desc = lguest_load_tr_desc;
+	paravirt_ops.set_ldt = lguest_set_ldt;
+	paravirt_ops.load_tls = lguest_load_tls;
+	paravirt_ops.set_debugreg = lguest_set_debugreg;
+	paravirt_ops.clts = lguest_clts;
+	paravirt_ops.read_cr0 = lguest_read_cr0;
+	paravirt_ops.write_cr0 = lguest_write_cr0;
+	paravirt_ops.init_IRQ = lguest_init_IRQ;
+	paravirt_ops.read_cr2 = lguest_read_cr2;
+	paravirt_ops.read_cr3 = lguest_read_cr3;
+	paravirt_ops.read_cr4 = lguest_read_cr4;
+	paravirt_ops.write_cr4 = lguest_write_cr4;
+	paravirt_ops.write_ldt_entry = lguest_write_ldt_entry;
+	paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
+	paravirt_ops.write_idt_entry = lguest_write_idt_entry;
+	paravirt_ops.patch = lguest_patch;
+	paravirt_ops.safe_halt = lguest_safe_halt;
+	paravirt_ops.get_wallclock = lguest_get_wallclock;
+	paravirt_ops.time_init = lguest_time_init;
+#ifdef PARAVIRT_LAZY_NONE
+	paravirt_ops.set_lazy_mode = lguest_lazy_mode;
+#endif
+
+	memset(lguest_data.hcall_status, 0xFF, sizeof(lguest_data.hcall_status));
+	lguest_data.noirq_start = (u32)lguest_noirq_start;
+	lguest_data.noirq_end = (u32)lguest_noirq_end;
+	hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
+	strncpy(saved_command_line, boot->cmdline, COMMAND_LINE_SIZE);
+
+	/* We use top of mem for initial pagetables. */
+	init_pg_tables_end = __pa(pg0);
+
+	/* set up PDA descriptor */
+	pack_descriptor((u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].a,
+			(u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].b,
+			(unsigned)&boot_pda, sizeof(boot_pda)-1,
+			0x80 | DESCTYPE_S | 0x02, 0);
+	load_gdt(&cpu_gdt_descr);
+	asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+
+	reserve_top_address(lguest_data.reserve_mem);
+
+	cpu_detect(&new_cpu_data);
+	/* Need this before paging_init. */
+	set_bit(X86_FEATURE_PGE, new_cpu_data.x86_capability);
+	/* Math is always hard! */
+	new_cpu_data.hard_math = 1;
+
+	/* FIXME: Better way? */
+	/* Suppress vgacon startup code */
+	SCREEN_INFO.orig_video_isVGA = VIDEO_TYPE_VLFB;
+
+	add_preferred_console("hvc", 0, NULL);
+
+#ifdef CONFIG_X86_MCE
+	mce_disabled = 1;
+#endif
+
+#ifdef CONFIG_ACPI
+	acpi_disabled = 1;
+	acpi_ht = 0;
+#endif
+	if (boot->initrd_size) {
+		/* We stash this at top of memory. */
+		INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size;
+		INITRD_SIZE = boot->initrd_size;
+		LOADER_TYPE = 0xFF;
+	}
+
+	pm_power_off = lguest_power_off;
+	start_kernel();
+}
+
+asm("lguest_maybe_init:\n"
+    "	cmpl $"__stringify(LGUEST_MAGIC_EBP)", %ebp\n"
+    "	jne 1f\n"
+    "	cmpl $"__stringify(LGUEST_MAGIC_EDI)", %edi\n"
+    "	jne 1f\n"
+    "	cmpl $"__stringify(LGUEST_MAGIC_ESI)", %esi\n"
+    "	je lguest_init\n"
+    "1: ret");
+extern void asmlinkage lguest_maybe_init(void);
+paravirt_probe(lguest_maybe_init);
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lguest_bus.c
@@ -0,0 +1,180 @@
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <asm/lguest_device.h>
+#include <asm/lguest.h>
+#include <asm/io.h>
+
+static ssize_t type_show(struct device *_dev,
+                         struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	return sprintf(buf, "%hu", lguest_devices[dev->index].type);
+}
+static ssize_t features_show(struct device *_dev,
+                             struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	return sprintf(buf, "%hx", lguest_devices[dev->index].features);
+}
+static ssize_t pfn_show(struct device *_dev,
+			 struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	return sprintf(buf, "%u", lguest_devices[dev->index].pfn);
+}
+static ssize_t status_show(struct device *_dev,
+                           struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	return sprintf(buf, "%hx", lguest_devices[dev->index].status);
+}
+static ssize_t status_store(struct device *_dev, struct device_attribute *attr,
+                            const char *buf, size_t count)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1)
+		return -EINVAL;
+	return count;
+}
+static struct device_attribute lguest_dev_attrs[] = {
+	__ATTR_RO(type),
+	__ATTR_RO(features),
+	__ATTR_RO(pfn),
+	__ATTR(status, 0644, status_show, status_store),
+	__ATTR_NULL
+};
+
+static int lguest_dev_match(struct device *_dev, struct device_driver *_drv)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv);
+
+	return (drv->device_type == lguest_devices[dev->index].type);
+}
+
+struct lguest_bus {
+	struct bus_type bus;
+	struct device dev;
+};
+
+static struct lguest_bus lguest_bus = {
+	.bus = {
+		.name  = "lguest",
+		.match = lguest_dev_match,
+		.dev_attrs = lguest_dev_attrs,
+	},
+	.dev = {
+		.parent = NULL,
+		.bus_id = "lguest",
+	}
+};
+
+static int lguest_dev_probe(struct device *_dev)
+{
+	int ret;
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	struct lguest_driver *drv = container_of(dev->dev.driver,
+						struct lguest_driver, drv);
+
+	lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER;
+	ret = drv->probe(dev);
+	if (ret == 0)
+		lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK;
+	return ret;
+}
+
+static int lguest_dev_remove(struct device *_dev)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	struct lguest_driver *drv = container_of(dev->dev.driver,
+						struct lguest_driver, drv);
+
+	if (dev->dev.driver && drv->remove)
+		drv->remove(dev);
+	put_device(&dev->dev);
+	return 0;
+}
+
+int register_lguest_driver(struct lguest_driver *drv)
+{
+	if (!lguest_devices)
+		return 0;
+
+	drv->drv.bus = &lguest_bus.bus;
+	drv->drv.name = drv->name;
+	drv->drv.owner = drv->owner;
+	drv->drv.probe = lguest_dev_probe;
+	drv->drv.remove = lguest_dev_remove;
+
+	return driver_register(&drv->drv);
+}
+EXPORT_SYMBOL_GPL(register_lguest_driver);
+
+void unregister_lguest_driver(struct lguest_driver *drv)
+{
+	if (!lguest_devices)
+		return;
+
+	driver_unregister(&drv->drv);
+}
+EXPORT_SYMBOL_GPL(unregister_lguest_driver);
+
+static void release_lguest_device(struct device *_dev)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+
+	lguest_devices[dev->index].status |= LGUEST_DEVICE_S_REMOVED_ACK;
+	kfree(dev);
+}
+
+static void add_lguest_device(unsigned int index)
+{
+	struct lguest_device *new;
+
+	lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE;
+	new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL);
+	if (!new) {
+		printk(KERN_EMERG "Cannot allocate lguest device %u\n", index);
+		lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
+		return;
+	}
+
+	new->index = index;
+	new->private = NULL;
+	memset(&new->dev, 0, sizeof(new->dev));
+	new->dev.parent = &lguest_bus.dev;
+	new->dev.bus = &lguest_bus.bus;
+	new->dev.release = release_lguest_device;
+	sprintf(new->dev.bus_id, "%u", index);
+	if (device_register(&new->dev) != 0) {
+		printk(KERN_EMERG "Cannot register lguest device %u\n", index);
+		lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
+		kfree(new);
+	}
+}
+
+static void scan_devices(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < LGUEST_MAX_DEVICES; i++)
+		if (lguest_devices[i].type)
+			add_lguest_device(i);
+}
+
+static int __init lguest_bus_init(void)
+{
+	if (strcmp(paravirt_ops.name, "lguest") != 0)
+		return 0;
+
+	/* Devices are in the page above the top of "normal" memory. */
+	lguest_devices = ioremap(max_pfn << PAGE_SHIFT, PAGE_SIZE);
+
+	if (bus_register(&lguest_bus.bus) != 0
+	    || device_register(&lguest_bus.dev) != 0)
+		panic("lguest bus registration failed");
+
+	scan_devices();
+	return 0;
+}
+postcore_initcall(lguest_bus_init);
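
For reference, hooking a driver onto this bus takes nothing more than filling in a struct lguest_driver; an untested skeleton for a console-type device might look like this (the demo_* names are hypothetical):

    #include <linux/module.h>
    #include <asm/lguest_device.h>

    static int demo_probe(struct lguest_device *dev)
    {
        printk(KERN_INFO "demo: device %u found\n", dev->index);
        dev->private = NULL;   /* driver state hangs off here */
        return 0;              /* 0 => bus sets LGUEST_DEVICE_S_DRIVER_OK */
    }

    static void demo_remove(struct lguest_device *dev)
    {
    }

    static struct lguest_driver demo_drv = {
        .name        = "demo",
        .owner       = THIS_MODULE,
        .device_type = LGUEST_DEVICE_T_CONSOLE,
        .probe       = demo_probe,
        .remove      = demo_remove,
    };

    static int __init demo_init(void)
    {
        return register_lguest_driver(&demo_drv);
    }
    module_init(demo_init);
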
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lguest_user.c
@@ -0,0 +1,242 @@
+/* Userspace control of the guest, via /dev/lguest. */
+#include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include "lg.h"
+
+static struct lguest_state *setup_guest_state(unsigned int num, void *pgdir,
+					      unsigned long start)
+{
+	struct lguest_state *guest = &__lguest_states()[num];
+	unsigned int i;
+	const long *def = __lguest_default_idt_entries();
+	struct lguest_regs *regs;
+
+	guest->gdt_table[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
+	guest->gdt_table[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
+	guest->gdt.size = GDT_ENTRIES*8-1;
+	guest->gdt.address = (unsigned long)&guest->gdt_table;
+
+	/* The guest's IDT entries are all initialized from the defaults. */
+	guest->idt.size = 8 * IDT_ENTRIES;
+	guest->idt.address = (long)guest->idt_table;
+	for (i = 0; i < IDT_ENTRIES; i++) {
+		u32 flags = 0x8e00;
+
+		/* They can't "int" into any of them except hypercall. */
+		if (i == LGUEST_TRAP_ENTRY)
+			flags |= (GUEST_DPL << 13);
+
+		guest->idt_table[i].a = (LGUEST_CS<<16) | (def[i]&0x0000FFFF);
+		guest->idt_table[i].b = (def[i]&0xFFFF0000) | flags;
+	}
+
+	memset(&guest->tss, 0, sizeof(guest->tss));
+	guest->tss.ss0 = LGUEST_DS;
+	guest->tss.esp0 = (unsigned long)(guest+1);
+	guest->tss.io_bitmap_base = sizeof(guest->tss); /* No I/O for you! */
+
+	/* Write out stack in the format lguest expects, so we can switch to it. */
+	regs = &guest->regs;
+	regs->cr3 = __pa(pgdir);
+	regs->eax = regs->ebx = regs->ecx = regs->edx = regs->esp = 0;
+	regs->edi = LGUEST_MAGIC_EDI;
+	regs->ebp = LGUEST_MAGIC_EBP;
+	regs->esi = LGUEST_MAGIC_ESI;
+	regs->gs = regs->fs = 0;
+	regs->ds = regs->es = __KERNEL_DS|GUEST_DPL;
+	regs->trapnum = regs->errcode = 0;
+	regs->eip = start;
+	regs->cs = __KERNEL_CS|GUEST_DPL;
+	regs->eflags = 0x202; 	/* Interrupts enabled. */
+	regs->ss = __KERNEL_DS|GUEST_DPL;
+
+	if (!fixup_gdt_table(guest->gdt_table, ARRAY_SIZE(guest->gdt_table),
+			     &guest->regs, &guest->tss))
+		return NULL;
+
+	return guest;
+}
+
+/* + addr */
+static long user_get_dma(struct lguest *lg, const u32 __user *input)
+{
+	unsigned long addr, udma, irq;
+
+	if (get_user(addr, input) != 0)
+		return -EFAULT;
+	udma = get_dma_buffer(lg, addr, &irq);
+	if (!udma)
+		return -ENOENT;
+
+	/* We put irq number in udma->used_len. */
+	lhwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
+	return udma;
+}
+
+/* + irq */
+static int user_send_irq(struct lguest *lg, const u32 __user *input)
+{
+	u32 irq;
+
+	if (get_user(irq, input) != 0)
+		return -EFAULT;
+	if (irq >= LGUEST_IRQS)
+		return -EINVAL;
+	set_bit(irq, lg->irqs_pending);
+	return 0;
+}
+
+static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
+{
+	struct lguest *lg = file->private_data;
+
+	if (!lg)
+		return -EINVAL;
+
+	if (lg->dead) {
+		size_t len;
+
+		if (lg->dead == (void *)-1)
+			return -ENOMEM;
+
+		len = min(size, strlen(lg->dead)+1);
+		if (copy_to_user(user, lg->dead, len) != 0)
+			return -EFAULT;
+		return len;
+	}
+
+	if (lg->dma_is_pending)
+		lg->dma_is_pending = 0;
+
+	return run_guest(lg, user);
+}
+
+/* Take: pfnlimit, pgdir, start, pageoffset. */
+static int initialize(struct file *file, const u32 __user *input)
+{
+	struct lguest *lg;
+	int err, i;
+	u32 args[4];
+
+	if (file->private_data)
+		return -EBUSY;
+
+	if (copy_from_user(args, input, sizeof(args)) != 0)
+		return -EFAULT;
+
+	if (args[1] <= PAGE_SIZE)
+		return -EINVAL;
+
+	down(&lguest_lock);
+	i = find_free_guest();
+	if (i < 0) {
+		err = -ENOSPC;
+		goto unlock;
+	}
+	lg = &lguests[i];
+	lg->guestid = i;
+	lg->pfn_limit = args[0];
+	lg->page_offset = args[3];
+
+	lg->trap_page = (u32 *)get_zeroed_page(GFP_KERNEL);
+	if (!lg->trap_page) {
+		err = -ENOMEM;
+		goto release_guest;
+	}
+
+	err = init_guest_pagetable(lg, args[1]);
+	if (err)
+		goto free_trap_page;
+
+	lg->state = setup_guest_state(i, lg->pgdirs[lg->pgdidx].pgdir,args[2]);
+	if (!lg->state) {
+		err = -ENOEXEC;
+		goto release_pgtable;
+	}
+	up(&lguest_lock);
+
+	lg->tsk = current;
+	lg->mm = get_task_mm(current);
+	file->private_data = lg;
+	return sizeof(args);
+
+release_pgtable:
+	free_guest_pagetable(lg);
+free_trap_page:
+	free_page((long)lg->trap_page);
+release_guest:
+	memset(lg, 0, sizeof(*lg));
+unlock:
+	up(&lguest_lock);
+	return err;
+}
+
+static ssize_t write(struct file *file, const char __user *input,
+		     size_t size, loff_t *off)
+{
+	struct lguest *lg = file->private_data;
+	u32 req;
+
+	if (get_user(req, input) != 0)
+		return -EFAULT;
+	input += sizeof(req);
+
+	if (req != LHREQ_INITIALIZE && !lg)
+		return -EINVAL;
+	if (lg && lg->dead)
+		return -ENOENT;
+
+	switch (req) {
+	case LHREQ_INITIALIZE:
+		return initialize(file, (const u32 __user *)input);
+	case LHREQ_GETDMA:
+		return user_get_dma(lg, (const u32 __user *)input);
+	case LHREQ_IRQ:
+		return user_send_irq(lg, (const u32 __user *)input);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int close(struct inode *inode, struct file *file)
+{
+	struct lguest *lg = file->private_data;
+
+	if (!lg)
+		return 0;
+
+	down(&lguest_lock);
+	release_all_dma(lg);
+	free_page((long)lg->trap_page);
+	free_guest_pagetable(lg);
+	mmput(lg->mm);
+	if (lg->dead != (void *)1)
+		kfree(lg->dead);
+	memset(lg->state, 0, sizeof(*lg->state));
+	memset(lg, 0, sizeof(*lg));
+	up(&lguest_lock);
+	return 0;
+}
+
+static struct file_operations lguest_fops = {
+	.owner	 = THIS_MODULE,
+	.release = close,
+	.write	 = write,
+	.read	 = read,
+};
+static struct miscdevice lguest_dev = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= "lguest",
+	.fops	= &lguest_fops,
+};
+
+int __init lguest_device_init(void)
+{
+	return misc_register(&lguest_dev);
+}
+
+void __exit lguest_device_remove(void)
+{
+	misc_deregister(&lguest_dev);
+}
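
Seen from the launcher, the whole interface is u32s down write(): the first word selects the request, the rest are its arguments, and read() then runs the guest until it needs servicing (or hands back its death string). A bare-bones sketch, with made-up addresses and no error handling:

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    int main(void)
    {
        /* LHREQ_INITIALIZE + pfnlimit, pgdir, start, pageoffset */
        uint32_t init[5] = { 0, 0x4000, 0x102000, 0x100000, 0xC0000000 };
        uint32_t irq[2]  = { 2, 4 };   /* LHREQ_IRQ: raise interrupt 4 */
        char buf[256];
        int fd = open("/dev/lguest", O_RDWR);

        write(fd, init, sizeof(init)); /* create and set up the guest */
        write(fd, irq, sizeof(irq));   /* mark irq 4 pending */
        read(fd, buf, sizeof(buf));    /* run guest until it needs us */
        return 0;
    }
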
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/page_tables.c
@@ -0,0 +1,374 @@
+/* Shadow page table operations.
+ * Copyright (C) Rusty Russell IBM Corporation 2006.
+ * GPL v2 and any later version */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/percpu.h>
+#include <asm/tlbflush.h>
+#include "lg.h"
+
+#define PTES_PER_PAGE_SHIFT 10
+#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
+#define HYPERVISOR_PGD_ENTRY (PTES_PER_PAGE - 1)
+
+static DEFINE_PER_CPU(u32 *, hypervisor_pte_pages) = { NULL };
+#define hypervisor_pte_page(cpu) per_cpu(hypervisor_pte_pages, cpu)
+
+static unsigned vaddr_to_pgd(unsigned long vaddr)
+{
+	return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
+}
+
+/* These access the real versions. */
+static u32 *toplev(struct lguest *lg, u32 i, unsigned long vaddr)
+{
+	unsigned int index = vaddr_to_pgd(vaddr);
+
+	if (index >= HYPERVISOR_PGD_ENTRY) {
+		kill_guest(lg, "attempt to access hypervisor pages");
+		index = 0;
+	} 
+	return &lg->pgdirs[i].pgdir[index];
+}
+
+static u32 *pteof(struct lguest *lg, u32 top, unsigned long vaddr)
+{
+	u32 *page = __va(top&PAGE_MASK);
+	BUG_ON(!(top & _PAGE_PRESENT));
+	return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
+}
+
+/* These access the guest versions. */
+static u32 gtoplev(struct lguest *lg, unsigned long vaddr)
+{
+	unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
+	return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(u32);
+}
+
+static u32 gpteof(struct lguest *lg, u32 gtop, unsigned long vaddr)
+{
+	u32 gpage = (gtop&PAGE_MASK);
+	BUG_ON(!(gtop & _PAGE_PRESENT));
+	return gpage + ((vaddr >> PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(u32);
+}
+
+static void release_pte(u32 pte)
+{
+	if (pte & _PAGE_PRESENT)
+		put_page(pfn_to_page(pte >> PAGE_SHIFT));
+}
+
+/* Do a virtual -> physical mapping on a user page. */
+static unsigned long get_pfn(unsigned long virtpfn, int write)
+{
+	struct vm_area_struct *vma;
+	struct page *page;
+	unsigned long ret = -1UL;
+
+	down_read(&current->mm->mmap_sem);
+	if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
+			   1, write, 1, &page, &vma) == 1)
+		ret = page_to_pfn(page);
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+static u32 check_pgtable_entry(struct lguest *lg, u32 entry)
+{
+	if ((entry & (_PAGE_PWT|_PAGE_PSE))
+	    || (entry >> PAGE_SHIFT) >= lg->pfn_limit)
+		kill_guest(lg, "bad page table entry");
+	return entry & ~_PAGE_GLOBAL;
+}
+
+static u32 get_pte(struct lguest *lg, u32 entry, int write)
+{
+	u32 pfn;
+
+	pfn = get_pfn(entry >> PAGE_SHIFT, write);
+	if (pfn == -1UL) {
+		kill_guest(lg, "failed to get page %u", entry>>PAGE_SHIFT);
+		return 0;
+	}
+	return ((pfn << PAGE_SHIFT) | (entry & (PAGE_SIZE-1)));
+}
+
+/* FIXME: We hold references to pages, which prevents them from being
+   swapped.  It'd be nice to have a callback when Linux wants to swap out. */
+
+/* We fault pages in, which allows us to update accessed/dirty bits.
+ * Returns 0 on failure, 1 on success. */
+static int page_in(struct lguest *lg, u32 vaddr, unsigned flags)
+{
+	u32 gtop, gpte;
+	u32 *top, *pte, *ptepage;
+	u32 val;
+
+	gtop = gtoplev(lg, vaddr);
+	val = lhread_u32(lg, gtop);
+	if (!(val & _PAGE_PRESENT))
+		return 0;
+
+	top = toplev(lg, lg->pgdidx, vaddr);
+	if (!(*top & _PAGE_PRESENT)) {
+		/* Get a PTE page for them. */
+		ptepage = (void *)get_zeroed_page(GFP_KERNEL);
+		/* FIXME: Steal from self in this case? */
+		if (!ptepage) {
+			kill_guest(lg, "out of memory allocating pte page");
+			return 0;
+		}
+		val = check_pgtable_entry(lg, val);
+		*top = (__pa(ptepage) | (val & (PAGE_SIZE-1)));
+	} else
+		ptepage = __va(*top & PAGE_MASK);
+
+	gpte = gpteof(lg, val, vaddr);
+	val = lhread_u32(lg, gpte);
+
+	/* No page, or write to readonly page? */
+	if (!(val&_PAGE_PRESENT) || ((flags&_PAGE_DIRTY) && !(val&_PAGE_RW)))
+		return 0;
+
+	pte = pteof(lg, *top, vaddr);
+	val = check_pgtable_entry(lg, val) | flags;
+
+	/* We're done with the old pte. */
+	release_pte(*pte);
+
+	/* We don't make it writable if this isn't a write: a later
+	 * write will fault so we can set the dirty bit in the guest. */
+	if (val & _PAGE_DIRTY)
+		*pte = get_pte(lg, val, 1);
+	else
+		*pte = get_pte(lg, val & ~_PAGE_RW, 0);
+
+	/* Now we update dirty/accessed on guest. */
+	lhwrite_u32(lg, gpte, val);
+	return 1;
+}
+
+int demand_page(struct lguest *lg, u32 vaddr, int write)
+{
+	return page_in(lg, vaddr, (write ? _PAGE_DIRTY : 0)|_PAGE_ACCESSED);
+}
+
+void pin_stack_pages(struct lguest *lg)
+{
+	unsigned int i;
+	u32 stack = lg->state->tss.esp1;
+
+	for (i = 0; i < lg->stack_pages; i++)
+		if (!demand_page(lg, stack - i*PAGE_SIZE, 1))
+			kill_guest(lg, "bad stack page %i@%#x", i, stack);
+}
+
+static unsigned int find_pgdir(struct lguest *lg, u32 pgtable)
+{
+	unsigned int i;
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+		if (lg->pgdirs[i].cr3 == pgtable)
+			break;
+	return i;
+}
+
+static void release_pgd(struct lguest *lg, u32 *pgd)
+{
+	if (*pgd & _PAGE_PRESENT) {
+		unsigned int i;
+		u32 *ptepage = __va(*pgd & ~(PAGE_SIZE-1));
+		for (i = 0; i < PTES_PER_PAGE; i++)
+			release_pte(ptepage[i]);
+		free_page((long)ptepage);
+		*pgd = 0;
+	}
+}
+
+static void flush_user_mappings(struct lguest *lg, int idx)
+{
+	unsigned int i;
+	for (i = 0; i < vaddr_to_pgd(lg->page_offset); i++)
+		release_pgd(lg, lg->pgdirs[idx].pgdir + i);
+}
+
+void guest_pagetable_flush_user(struct lguest *lg)
+{
+	flush_user_mappings(lg, lg->pgdidx);
+}
+
+static unsigned int new_pgdir(struct lguest *lg, u32 cr3)
+{
+	unsigned int next;
+
+	next = (lg->pgdidx + random32()) % ARRAY_SIZE(lg->pgdirs);
+	if (!lg->pgdirs[next].pgdir) {
+		lg->pgdirs[next].pgdir = (u32 *)get_zeroed_page(GFP_KERNEL);
+		if (!lg->pgdirs[next].pgdir)
+			next = lg->pgdidx;
+	}
+	lg->pgdirs[next].cr3 = cr3;
+	/* Release all the non-kernel mappings. */
+	flush_user_mappings(lg, next);
+
+	return next;
+}
+
+void guest_new_pagetable(struct lguest *lg, u32 pgtable)
+{
+	int newpgdir;
+
+	newpgdir = find_pgdir(lg, pgtable);
+	if (newpgdir == ARRAY_SIZE(lg->pgdirs))
+		newpgdir = new_pgdir(lg, pgtable);
+	lg->pgdidx = newpgdir;
+	lg->state->regs.cr3 = __pa(lg->pgdirs[lg->pgdidx].pgdir);
+	pin_stack_pages(lg);
+}
+
+static void release_all_pagetables(struct lguest *lg)
+{
+	unsigned int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+		if (lg->pgdirs[i].pgdir)
+			for (j = 0; j < HYPERVISOR_PGD_ENTRY; j++)
+				release_pgd(lg, lg->pgdirs[i].pgdir + j);
+}
+
+void guest_pagetable_clear_all(struct lguest *lg)
+{
+	release_all_pagetables(lg);
+	pin_stack_pages(lg);
+}
+
+static void do_set_pte(struct lguest *lg, int idx,
+		       unsigned long vaddr, u32 val)
+{
+	u32 *top = toplev(lg, idx, vaddr);
+	if (*top & _PAGE_PRESENT) {
+		u32 *pte = pteof(lg, *top, vaddr);
+		release_pte(*pte);
+		if (val & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+			val = check_pgtable_entry(lg, val);
+			*pte = get_pte(lg, val, val & _PAGE_DIRTY);
+		} else
+			*pte = 0;
+	}
+}
+
+void guest_set_pte(struct lguest *lg,
+		   unsigned long cr3, unsigned long vaddr, u32 val)
+{
+	/* Kernel mappings must be changed on all top levels. */
+	if (vaddr >= lg->page_offset) {
+		unsigned int i;
+		for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+			if (lg->pgdirs[i].pgdir)
+				do_set_pte(lg, i, vaddr, val);
+	} else {
+		int pgdir = find_pgdir(lg, cr3);
+		if (pgdir != ARRAY_SIZE(lg->pgdirs))
+			do_set_pte(lg, pgdir, vaddr, val);
+	}
+}
+
+void guest_set_pud(struct lguest *lg, unsigned long cr3, u32 idx)
+{
+	int pgdir;
+
+	if (idx >= HYPERVISOR_PGD_ENTRY)
+		return;
+
+	pgdir = find_pgdir(lg, cr3);
+	if (pgdir < ARRAY_SIZE(lg->pgdirs))
+		release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
+}
+
+int init_guest_pagetable(struct lguest *lg, u32 pgtable)
+{
+	/* We assume this in flush_user_mappings, so check now */
+	if (vaddr_to_pgd(lg->page_offset) >= HYPERVISOR_PGD_ENTRY)
+		return -EINVAL;
+	lg->pgdidx = 0;
+	lg->pgdirs[lg->pgdidx].cr3 = pgtable;
+	lg->pgdirs[lg->pgdidx].pgdir = (u32*)get_zeroed_page(GFP_KERNEL);
+	if (!lg->pgdirs[lg->pgdidx].pgdir)
+		return -ENOMEM;
+	return 0;
+}
+
+void free_guest_pagetable(struct lguest *lg)
+{
+	unsigned int i;
+
+	release_all_pagetables(lg);
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+		free_page((long)lg->pgdirs[i].pgdir);
+}
+
+/* Caller must be preempt-safe */
+void map_trap_page(struct lguest *lg)
+{
+	int cpu = smp_processor_id();
+	
+	hypervisor_pte_page(cpu)[0] = (__pa(lg->trap_page)|_PAGE_PRESENT);
+
+	/* Since the hypervisor is less than 4MB, we simply mug the top pte page. */
+	lg->pgdirs[lg->pgdidx].pgdir[HYPERVISOR_PGD_ENTRY] =
+		(__pa(hypervisor_pte_page(cpu))| _PAGE_KERNEL);
+}
+
+static void free_hypervisor_pte_pages(void)
+{
+	int i;
+	
+	for_each_possible_cpu(i)
+		free_page((long)hypervisor_pte_page(i));
+}
+
+static __init int alloc_hypervisor_pte_pages(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		hypervisor_pte_page(i) = (u32 *)get_zeroed_page(GFP_KERNEL);
+		if (!hypervisor_pte_page(i)) {
+			free_hypervisor_pte_pages();
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+static __init void populate_hypervisor_pte_page(int cpu)
+{
+	int i;
+	u32 *pte = hypervisor_pte_page(cpu);
+
+	for (i = 0; i < HYPERVISOR_PAGES; i++) {
+		/* First entry set dynamically in map_trap_page */
+		pte[i+1] = ((page_to_pfn(&hype_pages[i]) << PAGE_SHIFT) 
+			    | _PAGE_KERNEL_EXEC);
+	}
+}
+
+__init int init_pagetables(struct page hype_pages[])
+{
+	int ret;
+	unsigned int i;
+
+	ret = alloc_hypervisor_pte_pages();
+	if (ret)
+		return ret;
+
+	for_each_possible_cpu(i)
+		populate_hypervisor_pte_page(i);
+	return 0;
+}
+
+__exit void free_pagetables(void)
+{
+	free_hypervisor_pte_pages();
+}
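
The index arithmetic throughout this file is the classic i386 two-level 10/10/12 split: with PTES_PER_PAGE_SHIFT of 10, bits 31-22 of a virtual address pick the top-level entry, bits 21-12 the pte within the pte page, and bits 11-0 the byte within the page. Worked out in user space:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PTES_PER_PAGE_SHIFT 10
    #define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)

    int main(void)
    {
        unsigned long vaddr = 0xC0123456;  /* a typical kernel address */
        unsigned pgd = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
        unsigned pte = (vaddr >> PAGE_SHIFT) % PTES_PER_PAGE;
        unsigned off = vaddr & ((1 << PAGE_SHIFT) - 1);

        /* Prints: pgd 768, pte 291, offset 0x456 */
        printf("vaddr %#lx: pgd %u, pte %u, offset %#x\n",
               vaddr, pgd, pte, off);
        return 0;
    }
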
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/segments.c
@@ -0,0 +1,171 @@
+#include "lg.h"
+
+/* Dealing with GDT entries is such a horror, I convert to sanity and back */
+struct decoded_gdt_entry
+{
+	u32 base, limit;
+	union {
+		struct {
+			unsigned type:4;
+			unsigned dtype:1;
+			unsigned dpl:2;
+			unsigned present:1;
+			unsigned unused:4;
+			unsigned avl:1;
+			unsigned mbz:1;
+			unsigned def:1;
+			unsigned page_granularity:1;
+		};
+		u16 raw_attributes;
+	};
+};
+
+static struct decoded_gdt_entry decode_gdt_entry(const struct desc_struct *en)
+{
+	struct decoded_gdt_entry de;
+	de.base = ((en->a >> 16) | ((en->b & 0xff) << 16) 
+		   | (en->b & 0xFF000000));
+	de.limit = ((en->a & 0xFFFF) | (en->b & 0xF0000));
+	de.raw_attributes = (en->b >> 8);
+	return de;
+}
+
+static struct desc_struct encode_gdt_entry(const struct decoded_gdt_entry *de)
+{
+	struct desc_struct en;
+	en.a = ((de->limit & 0xFFFF) | (de->base << 16));
+	en.b = (((de->base >> 16) & 0xFF) 
+		 | ((((u32)de->raw_attributes) & 0xF0FF) << 8)
+		 | (de->limit & 0xF0000)
+		 | (de->base & 0xFF000000));
+	return en;
+}
+
+static int check_desc(const struct decoded_gdt_entry *dec)
+{
+	return (dec->mbz == 0 && dec->dtype == 1 && (dec->type & 4) == 0);
+}
+
+static void check_segment(const struct desc_struct *gdt, u32 *segreg)
+{
+	if (*segreg > 255 || !(gdt[*segreg >> 3].b & 0x8000))
+		*segreg = 0;
+}
+
+/* Ensure our manually-loaded segment regs don't fault in switch_to_guest. */
+static void check_live_segments(const struct desc_struct *gdt,
+				struct lguest_regs *regs)
+{
+	check_segment(gdt, &regs->es);
+	check_segment(gdt, &regs->ds);
+	check_segment(gdt, &regs->fs);
+	check_segment(gdt, &regs->gs);
+}
+
+int fixup_gdt_table(struct desc_struct *gdt, unsigned int num,
+		    struct lguest_regs *regs, struct x86_tss *tss)
+{
+	unsigned int i;
+	struct decoded_gdt_entry dec;
+
+	for (i = 0; i < num; i++) {
+		unsigned long base, length;
+
+		/* We override these, so we don't care what they contain. */
+		if (i == GDT_ENTRY_TSS
+		    || i == GDT_ENTRY_LGUEST_CS
+		    || i == GDT_ENTRY_LGUEST_DS
+		    || i == GDT_ENTRY_DOUBLEFAULT_TSS)
+			continue;
+
+		dec = decode_gdt_entry(&gdt[i]);
+		if (!dec.present)
+			continue;
+
+		if (!check_desc(&dec))
+			return 0;
+
+		base = dec.base;
+		length = dec.limit + 1;
+		if (dec.page_granularity) {
+			base *= PAGE_SIZE;
+			length *= PAGE_SIZE;
+		}
+
+		/* Unacceptable base? */
+		if (base >= HYPE_ADDR)
+			return 0;
+
+		/* Wrap around or segment overlaps hypervisor mem? */
+		if (!length
+		    || base + length < base
+		    || base + length > HYPE_ADDR) {
+			/* Trim to edge of hypervisor. */
+			length = HYPE_ADDR - base;
+			if (dec.page_granularity)
+				dec.limit = (length / PAGE_SIZE) - 1;
+			else
+				dec.limit = length - 1;
+		}
+		if (dec.dpl == 0)
+			dec.dpl = GUEST_DPL;
+		gdt[i] = encode_gdt_entry(&dec);
+	}
+	check_live_segments(gdt, regs);
+
+	/* Now put in hypervisor data and code segments. */
+	gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+	gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+
+	/* Finally, TSS entry */
+	dec.base = (unsigned long)tss;
+	dec.limit = sizeof(*tss)-1;
+	dec.type = 0x9;
+	dec.dtype = 0;
+	dec.def = 0;
+	dec.present = 1;
+	dec.mbz = 0;
+	dec.page_granularity = 0;
+	gdt[GDT_ENTRY_TSS] = encode_gdt_entry(&dec);
+
+	return 1;
+}
+
+void load_guest_gdt(struct lguest *lg, u32 table, u32 num)
+{
+	if (num > GDT_ENTRIES)
+		kill_guest(lg, "too many gdt entries %i", num);
+
+	lhread(lg, lg->state->gdt_table, table,
+	       num * sizeof(lg->state->gdt_table[0]));
+	if (!fixup_gdt_table(lg->state->gdt_table, num, 
+			     &lg->state->regs, &lg->state->tss))
+		kill_guest(lg, "bad gdt table");
+}
+
+/* We don't care about limit here, since we only let them use these in
+ * usermode (where lack of USER bit in pagetable protects hypervisor mem).
+ * However, we want to ensure it doesn't fault when loaded, since *we* are
+ * the ones who will load it in switch_to_guest.
+ */
+void guest_load_tls(struct lguest *lg, const struct desc_struct __user *gtls)
+{
+	unsigned int i;
+	struct desc_struct *tls = &lg->state->gdt_table[GDT_ENTRY_TLS_MIN];
+
+	lhread(lg, tls, (u32)gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
+	for (i = 0; i < ARRAY_SIZE(lg->tls_limits); i++) {
+		struct decoded_gdt_entry dec = decode_gdt_entry(&tls[i]);
+
+		if (!dec.present)
+			continue;
+
+		/* We truncate to one byte/page (depending on G bit) to neuter
+		   it, so ensure it's more than 1 page below trap page. */
+		tls[i].a &= 0xFFFF0000;
+		lg->tls_limits[i] = dec.limit;
+		if (!check_desc(&dec) || dec.base > HYPE_ADDR - PAGE_SIZE)
+			kill_guest(lg, "bad TLS descriptor %i", i);
+	}
+	check_live_segments(lg->state->gdt_table, &lg->state->regs);
+}
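
Since decode_gdt_entry()/encode_gdt_entry() are pure bit-shuffling, they round-trip exactly and are easy to unit-test in user space. A quick check, with the structures copied from above and the same layout assumptions as the patch itself (gcc bitfields on little-endian x86):

    #include <assert.h>
    #include <stdio.h>

    typedef unsigned int u32;
    typedef unsigned short u16;

    struct desc_struct { u32 a, b; };

    struct decoded_gdt_entry {
        u32 base, limit;
        union {
            struct {
                unsigned type:4, dtype:1, dpl:2, present:1;
                unsigned unused:4, avl:1, mbz:1, def:1;
                unsigned page_granularity:1;
            };
            u16 raw_attributes;
        };
    };

    static struct decoded_gdt_entry decode(const struct desc_struct *en)
    {
        struct decoded_gdt_entry de;
        de.base = (en->a >> 16) | ((en->b & 0xff) << 16)
                  | (en->b & 0xFF000000);
        de.limit = (en->a & 0xFFFF) | (en->b & 0xF0000);
        de.raw_attributes = en->b >> 8;
        return de;
    }

    static struct desc_struct encode(const struct decoded_gdt_entry *de)
    {
        struct desc_struct en;
        en.a = (de->limit & 0xFFFF) | (de->base << 16);
        en.b = ((de->base >> 16) & 0xFF)
               | ((((u32)de->raw_attributes) & 0xF0FF) << 8)
               | (de->limit & 0xF0000) | (de->base & 0xFF000000);
        return en;
    }

    int main(void)
    {
        /* Flat 4GB code segment: base 0, limit 0xFFFFF, 4K granularity. */
        struct desc_struct seg = { 0x0000FFFF, 0x00CF9A00 };
        struct decoded_gdt_entry de = decode(&seg);
        struct desc_struct back = encode(&de);

        printf("base=%#x limit=%#x dpl=%u gran=%u\n", de.base, de.limit,
               (unsigned)de.dpl, (unsigned)de.page_granularity);
        assert(back.a == seg.a && back.b == seg.b);
        return 0;
    }
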
===================================================================
--- /dev/null
+++ b/include/asm-i386/lguest.h
@@ -0,0 +1,86 @@
+/* Things the lguest guest needs to know. */
+#ifndef _ASM_LGUEST_H
+#define _ASM_LGUEST_H
+
+#define LGUEST_MAGIC_EBP 0x4C687970
+#define LGUEST_MAGIC_EDI 0x652D4D65
+#define LGUEST_MAGIC_ESI 0xFFFFFFFF
+
+#define LHCALL_FLUSH_ASYNC	0
+#define LHCALL_LGUEST_INIT	1
+#define LHCALL_CRASH		2
+#define LHCALL_LOAD_GDT		3
+#define LHCALL_NEW_PGTABLE	4
+#define LHCALL_FLUSH_TLB	5
+#define LHCALL_LOAD_IDT_ENTRY	6
+#define LHCALL_SET_STACK	7
+#define LHCALL_TS		8
+#define LHCALL_TIMER_READ	9
+#define LHCALL_TIMER_START	10
+#define LHCALL_HALT		11
+#define LHCALL_GET_WALLCLOCK	12
+#define LHCALL_BIND_DMA		13
+#define LHCALL_SEND_DMA		14
+#define LHCALL_SET_PTE		15
+#define LHCALL_SET_UNKNOWN_PTE	16
+#define LHCALL_SET_PUD		17
+#define LHCALL_LOAD_TLS		18
+
+#define LGUEST_TRAP_ENTRY 0x1F
+
+static inline unsigned long
+hcall(unsigned long call,
+      unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
+		     : "=a"(call)
+		     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) 
+		     : "memory");
+	return call;
+}
+
+void async_hcall(unsigned long call,
+		 unsigned long arg1, unsigned long arg2, unsigned long arg3);
+
+#define LGUEST_IRQS 32
+
+#define LHCALL_RING_SIZE 64
+struct hcall_ring
+{
+	u32 eax, edx, ebx, ecx;
+};
+
+/* All the good stuff happens here: guest registers it with LGUEST_INIT */
+struct lguest_data
+{
+/* Fields which change during running: */
+	/* 512 == enabled (same as eflags) */
+	unsigned int irq_enabled;
+	/* Blocked interrupts. */
+	DECLARE_BITMAP(interrupts, LGUEST_IRQS); 
+
+	/* Last (userspace) address we got a GPF & reloaded gs. */
+	unsigned int gs_gpf_eip;
+
+	/* Virtual address of page fault. */
+	unsigned long cr2;
+
+	/* Async hypercall ring.  0xFF == done, 0 == pending. */
+	u8 hcall_status[LHCALL_RING_SIZE];
+	struct hcall_ring hcalls[LHCALL_RING_SIZE];
+
+/* Fields initialized by the hypervisor at boot: */
+	/* Memory not to try to access */
+	unsigned long reserve_mem;
+	/* ID of this guest (used by network driver to set ethernet address) */
+	u16 guestid;
+	/* Multiplier for TSC clock. */
+	u32 clock_mult;
+
+/* Fields initialized by the guest at boot: */
+	/* Instruction range to suppress interrupts even if enabled */
+	unsigned long noirq_start, noirq_end;
+};
+extern struct lguest_data lguest_data;
+extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */
+#endif	/* _ASM_LGUEST_H */
===================================================================
--- /dev/null
+++ b/include/asm-i386/lguest_device.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_LGUEST_DEVICE_H
+#define _ASM_LGUEST_DEVICE_H
+/* Everything you need to know about lguest devices. */
+#include <linux/device.h>
+#include <asm/lguest.h>
+#include <asm/lguest_user.h>
+
+struct lguest_device {
+	/* Unique busid, and index into lguest_page->devices[] */
+	/* By convention, each device can use irq index+1 if it wants to. */
+	unsigned int index;
+
+	struct device dev;
+
+	/* Driver can hang data off here. */
+	void *private;
+};
+
+struct lguest_driver {
+	const char *name;
+	struct module *owner;
+	u16 device_type;
+	int (*probe)(struct lguest_device *dev);
+	void (*remove)(struct lguest_device *dev);
+
+	struct device_driver drv;
+};
+
+extern int register_lguest_driver(struct lguest_driver *drv);
+extern void unregister_lguest_driver(struct lguest_driver *drv);
+#endif /* _ASM_LGUEST_DEVICE_H */
===================================================================
--- /dev/null
+++ b/include/asm-i386/lguest_user.h
@@ -0,0 +1,86 @@
+#ifndef _ASM_LGUEST_USER
+#define _ASM_LGUEST_USER
+/* Everything the "lguest" userspace program needs to know. */
+/* They can register up to 32 arrays of lguest_dma. */
+#define LGUEST_MAX_DMA		32
+/* At most we can dma 16 lguest_dma in one op. */
+#define LGUEST_MAX_DMA_SECTIONS	16
+
+/* How many devices?  Assume each device wants up to two dma arrays. */
+#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2)
+
+struct lguest_dma
+{
+	/* 0 if free to be used, filled by hypervisor. */
+	u32 used_len;
+	u32 addr[LGUEST_MAX_DMA_SECTIONS];
+	u16 len[LGUEST_MAX_DMA_SECTIONS];
+};
+
+/* This is found at address 0. */
+struct lguest_boot_info
+{
+	u32 max_pfn;
+	u32 initrd_size;
+	char cmdline[256];
+};
+
+struct lguest_block_page
+{
+	/* 0 is a read, 1 is a write. */
+	int type;
+	u32 sector; 	/* Offset in device = sector * 512. */
+	u32 bytes;	/* Length expected to be read/written in bytes */
+	/* 0 = pending, 1 = done, 2 = done, error */
+	int result;
+	u32 num_sectors; /* Disk length = num_sectors * 512 */
+};
+
+/* There is a shared page of these. */
+struct lguest_net
+{
+	union {
+		unsigned char mac[6];
+		struct {
+			u8 promisc;
+			u8 pad;
+			u16 guestid;
+		};
+	};
+};
+
+/* lguest_device_desc->type */
+#define LGUEST_DEVICE_T_CONSOLE	1
+#define LGUEST_DEVICE_T_NET	2
+#define LGUEST_DEVICE_T_BLOCK	3
+
+/* lguest_device_desc->status.  256 and above are device specific. */
+#define LGUEST_DEVICE_S_ACKNOWLEDGE	1 /* We have seen device. */
+#define LGUEST_DEVICE_S_DRIVER		2 /* We have found a driver */
+#define LGUEST_DEVICE_S_DRIVER_OK	4 /* Driver says OK! */
+#define LGUEST_DEVICE_S_REMOVED		8 /* Device has gone away. */
+#define LGUEST_DEVICE_S_REMOVED_ACK	16 /* Driver has been told. */
+#define LGUEST_DEVICE_S_FAILED		128 /* Something actually failed */
+
+#define LGUEST_NET_F_NOCSUM		0x4000 /* Don't bother checksumming */
+#define LGUEST_DEVICE_F_RANDOMNESS	0x8000 /* IRQ is fairly random */
+
+/* We have a page of these descriptors in the lguest_device page. */
+struct lguest_device_desc {
+	u16 type;
+	u16 features;
+	u16 status;
+	u16 num_pages;
+	u32 pfn;
+};
+
+/* Write command first word is a request. */
+enum lguest_req
+{
+	LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */
+	LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */
+	LHREQ_IRQ, /* + irq */
+};
+
+
+#endif /* _ASM_LGUEST_USER */
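
To tie this header back to lguest_bus.c: the launcher publishes one page of these descriptors just past guest memory, a type of 0 marks an unused slot (which is what scan_devices() skips), and the status field walks through the S_ACKNOWLEDGE/S_DRIVER/S_DRIVER_OK handshake. A sketch of walking such a page, assuming it has already been mapped (the sample entry is invented):

    #include <stdio.h>
    #include <stdint.h>

    struct lguest_device_desc {
        uint16_t type, features, status, num_pages;
        uint32_t pfn;
    };

    #define LGUEST_MAX_DEVICES 16   /* LGUEST_MAX_DMA / 2 */

    static const char *names[] = { "unused", "console", "net", "block" };

    static void list_devices(const struct lguest_device_desc *page)
    {
        unsigned int i;

        for (i = 0; i < LGUEST_MAX_DEVICES; i++) {
            if (!page[i].type)
                continue;   /* type 0 == empty slot */
            printf("dev %u: %s, status %#x, %u page(s) at pfn %u\n",
                   i, page[i].type <= 3 ? names[page[i].type] : "?",
                   page[i].status, page[i].num_pages, page[i].pfn);
        }
    }

    int main(void)
    {
        struct lguest_device_desc page[LGUEST_MAX_DEVICES] = {
            { .type = 1, .num_pages = 1, .pfn = 0x4000 }, /* console */
        };
        list_devices(page);
        return 0;
    }
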



