[RFC/PATCH LGUEST X86_64 07/13] lguest64 loader

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



plain text document attachment (lguest64-loader.patch)
I noticed that the lguest loader code for i386 was in
Documentation/lguest.  Well, that's fine (I guess) but
it can't just be for i386.  So I made a separate directory
to put the loader code in.  So now we have:

 Documentation/lguest/i386/... for the lguest i386 loader.

and
 Documentation/lguest/x86_64/... for the lguest x86_64 loader.

Signed-off-by: Steven Rostedt <srostedt@xxxxxxxxxx>
Signed-off-by: Glauber de Oliveira Costa <glommer@xxxxxxxxx>
Cc: Chris Wright <chrisw@xxxxxxxxxxxx>


Index: work-pv/Documentation/lguest/i386/Makefile
===================================================================
--- /dev/null
+++ work-pv/Documentation/lguest/i386/Makefile
@@ -0,0 +1,21 @@
+# This creates the demonstration utility "lguest" which runs a Linux guest.
+
+# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
+# Some shells (dash - ubuntu) can't handle numbers that big so we cheat.
+include ../../../.config
+LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
+
+CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \
+	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
+LDLIBS:=-lz
+
+all: lguest.lds lguest
+
+# The linker script on x86 is so complex the only way of creating one
+# which will link our binary in the right place is to mangle the
+# default one.
+lguest.lds:
+	$(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
+
+clean:
+	rm -f lguest.lds lguest
Index: work-pv/Documentation/lguest/i386/lguest.c
===================================================================
--- /dev/null
+++ work-pv/Documentation/lguest/i386/lguest.c
@@ -0,0 +1,1039 @@
+/* Simple program to layout "physical" memory for new lguest guest.
+ * Linked high to avoid likely physical memory.  */
+#define _LARGEFILE64_SOURCE
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <err.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <elf.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <time.h>
+#include <netinet/in.h>
+#include <net/if.h>
+#include <linux/sockios.h>
+#include <linux/if_tun.h>
+#include <sys/uio.h>
+#include <termios.h>
+#include <zlib.h>
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+#include "../../../include/asm/lguest_user.h"
+
+#define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
+#define NET_PEERNUM 1
+#define BRIDGE_PFX "bridge:"
+
+static bool verbose;
+#define verbose(args...) \
+	do { if (verbose) printf(args); fflush(stdout); } while(0)
+
+struct devices
+{
+	fd_set infds;
+	int max_infd;
+
+	struct device *dev;
+};
+
+struct device
+{
+	struct device *next;		/* next entry in the devices->dev list */
+	struct lguest_device_desc *desc; /* descriptor slot from get_dev_entry() */
+	void *mem;			/* desc->pfn * page size (see new_device) */
+
+	/* Watch this fd if handle_input non-NULL. */
+	int fd;
+	int (*handle_input)(int fd, struct device *me);
+
+	/* Watch DMA to this address if handle_output non-NULL. */
+	unsigned long watch_address;
+	u32 (*handle_output)(int fd, const struct iovec *iov,
+			     unsigned int num, struct device *me);
+
+	/* Device-specific data. */
+	void *priv;
+};
+
+static char buf[1024];
+static struct iovec discard_iov = { .iov_base=buf, .iov_len=sizeof(buf) };
+static int zero_fd;
+
+/* LGUEST_GUEST_TOP (from the Makefile) is just below us; keep the parens so
+   RESERVE_TOP/n works.  FIXME: vdso is mapped just under it: protect that. */
+#define RESERVE_TOP (LGUEST_GUEST_TOP - 1024*1024)
+
+/* Parse a number with an optional G/M/K (or g/m/k) binary size suffix. */
+static u32 memparse(const char *ptr)
+{
+	char *end;	/* set by strtoul() to the first char after the number */
+	unsigned long ret = strtoul(ptr, &end, 0);	/* base 0: prefix decides */
+	switch (*end) {
+	case 'G':
+	case 'g':
+		ret <<= 10;	/* falls through: G shifts by 30 bits in total */
+	case 'M':
+	case 'm':
+		ret <<= 10;	/* falls through: M shifts by 20 bits in total */
+	case 'K':
+	case 'k':
+		ret <<= 10;
+		end++;
+	default:
+		break;
+	}
+	return ret;	/* unsigned long converted to the u32 return type */
+}
+
+static inline unsigned long page_align(unsigned long addr)
+{
+	return ((addr + getpagesize()-1) & ~(getpagesize()-1));
+}
+
+/* initrd gets loaded at top of memory: return length. */
+static unsigned long load_initrd(const char *name, unsigned long end)
+{
+	int ifd;
+	struct stat st;
+	void *iaddr;
+
+	if (!name)
+		return 0;
+
+	ifd = open(name, O_RDONLY, 0);
+	if (ifd < 0)
+		err(1, "Opening initrd '%s'", name);
+
+	if (fstat(ifd, &st) < 0)
+		err(1, "fstat() on initrd '%s'", name);
+
+	iaddr = mmap((void *)end - st.st_size, st.st_size,
+		     PROT_READ|PROT_EXEC|PROT_WRITE,
+		     MAP_FIXED|MAP_PRIVATE, ifd, 0);
+	if (iaddr != (void *)end - st.st_size)
+		err(1, "Mmaping initrd '%s' returned %p not %p",
+		    name, iaddr, (void *)end - st.st_size);
+	close(ifd);
+	verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
+	return st.st_size;
+}
+
+/* First map /dev/zero over entire memory, then insert kernel. */
+static void map_memory(unsigned long mem)
+{
+	if (mmap(0, mem,
+		 PROT_READ|PROT_WRITE|PROT_EXEC,
+		 MAP_FIXED|MAP_PRIVATE, zero_fd, 0) != (void *)0)
+		err(1, "Mmaping /dev/zero for %li bytes", mem);
+}
+
+static u32 finish(unsigned long mem, unsigned long *page_offset,
+		  const char *initrd, unsigned long *ird_size)
+{
+	u32 *pgdir = NULL, *linear = NULL;
+	int i, pte_pages;
+
+	/* This is a top of mem. */
+	*ird_size = load_initrd(initrd, mem);
+
+	/* Below initrd is used as top level of pagetable. */
+	pte_pages = 1 + (mem/getpagesize() + 1023)/1024;
+
+	pgdir = (u32 *)page_align(mem - *ird_size - pte_pages*getpagesize());
+	linear = (void *)pgdir + getpagesize();
+
+	/* Linear map all of memory at page_offset (to top of mem). */
+	if (mem > -*page_offset)
+		mem = -*page_offset;
+
+	for (i = 0; i < mem / getpagesize(); i++)
+		linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
+	verbose("Linear %p-%p (%i-%i) = %#08x-%#08x\n",
+		linear, linear+i-1, 0, i-1, linear[0], linear[i-1]);
+
+	/* Now set up pgd so that this memory is at page_offset */
+	for (i = 0; i < mem / getpagesize(); i += getpagesize()/sizeof(u32)) {
+		pgdir[(i + *page_offset/getpagesize())/1024]
+			= (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
+		verbose("Top level %lu = %#08x\n",
+			(i + *page_offset/getpagesize())/1024,
+			pgdir[(i + *page_offset/getpagesize())/1024]);
+	}
+
+	return (unsigned long)pgdir;
+}
+
+/* Returns the entry point */
+static u32 map_elf(int elf_fd, const Elf32_Ehdr *ehdr, unsigned long mem,
+		   unsigned long *pgdir_addr,
+		   const char *initrd, unsigned long *ird_size,
+		   unsigned long *page_offset)
+{
+	void *addr;
+	Elf32_Phdr phdr[ehdr->e_phnum];
+	unsigned int i;
+
+	/* Sanity checks. */
+	if (ehdr->e_type != ET_EXEC
+	    || ehdr->e_machine != EM_386
+	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
+	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
+		errx(1, "Malformed elf header");
+
+	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
+		err(1, "Seeking to program headers");
+	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
+		err(1, "Reading program headers");
+
+	map_memory(mem);
+
+	*page_offset = 0;
+	/* We map the loadable segments at virtual addresses corresponding
+	 * to their physical addresses (our virtual == guest physical). */
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		if (phdr[i].p_type != PT_LOAD)
+			continue;
+
+		verbose("Section %i: size %i addr %p\n",
+			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+		/* We map everything private, writable. */
+		if (phdr[i].p_paddr + phdr[i].p_memsz > mem)
+			errx(1, "Segment %i overlaps end of memory", i);
+
+		/* We expect linear address space. */
+		if (!*page_offset)
+			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
+		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
+			errx(1, "Page offset of section %i different", i);
+
+		/* Recent ld versions don't page align any more. */
+		if (phdr[i].p_paddr % getpagesize()) {
+			phdr[i].p_filesz += (phdr[i].p_paddr % getpagesize());
+			phdr[i].p_offset -= (phdr[i].p_paddr % getpagesize());
+			phdr[i].p_paddr -= (phdr[i].p_paddr % getpagesize());
+		}
+		addr = mmap((void *)phdr[i].p_paddr,
+			    phdr[i].p_filesz,
+			    PROT_READ|PROT_WRITE|PROT_EXEC,
+			    MAP_FIXED|MAP_PRIVATE,
+			    elf_fd, phdr[i].p_offset);
+		if (addr != (void *)phdr[i].p_paddr)
+			err(1, "Mmaping vmlinux segment %i returned %p not %p (%p)",
+			    i, addr, (void *)phdr[i].p_paddr, &phdr[i].p_paddr);
+	}
+
+	*pgdir_addr = finish(mem, page_offset, initrd, ird_size);
+	/* Entry is physical address: convert to virtual */
+	return ehdr->e_entry + *page_offset;
+}
+
+static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
+{
+	/* Guess page offset: first top byte seen in 4 such movs; votes start at 0. */
+	unsigned int i, possibilities[256] = { 0 };
+	for (i = 0; i + 4 < len; i++) {
+		/* mov 0xXXXXXXXX,%eax: img[i+4] is the top byte of the address */
+		if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
+			return (unsigned long)img[i+4] << 24;
+	}
+	errx(1, "could not determine page offset");
+}
+
+static u32 bzimage(int fd, unsigned long mem, unsigned long *pgdir_addr,
+		   const char *initrd, unsigned long *ird_size,
+		   unsigned long *page_offset)
+{
+	gzFile f;
+	int ret, len = 0;
+	void *img = (void *)0x100000;
+
+	map_memory(mem);
+
+	f = gzdopen(fd, "rb");
+	if (gzdirect(f))
+		errx(1, "did not find correct gzip header");
+	while ((ret = gzread(f, img + len, 65536)) > 0)
+		len += ret;
+	if (ret < 0)
+		err(1, "reading image from bzImage");
+
+	verbose("Unpacked size %i addr %p\n", len, img);
+	*page_offset = intuit_page_offset(img, len);
+	*pgdir_addr = finish(mem, page_offset, initrd, ird_size);
+
+	/* Entry is physical address: convert to virtual */
+	return (u32)img + *page_offset;
+}
+
+static u32 load_bzimage(int bzimage_fd, const Elf32_Ehdr *ehdr,
+			unsigned long mem, unsigned long *pgdir_addr,
+			const char *initrd, unsigned long *ird_size,
+			unsigned long *page_offset)
+{
+	unsigned char c;
+	int state = 0;
+
+	/* Just brute force it. */
+	while (read(bzimage_fd, &c, 1) == 1) {
+		switch (state) {
+		case 0:
+			if (c == 0x1F)
+				state++;
+			break;
+		case 1:
+			if (c == 0x8B)
+				state++;
+			else
+				state = 0;
+			break;
+		case 2 ... 8:
+			state++;
+			break;
+		case 9:
+			lseek(bzimage_fd, -10, SEEK_CUR);
+			if (c != 0x03) /* Compressed under UNIX. */
+				state = -1;
+			else
+				return bzimage(bzimage_fd, mem, pgdir_addr,
+					       initrd, ird_size, page_offset);
+		}
+	}
+	errx(1, "Could not find kernel in bzImage");
+}
+
+static void *map_pages(unsigned long addr, unsigned int num)
+{
+	if (mmap((void *)addr, getpagesize() * num,
+		 PROT_READ|PROT_WRITE|PROT_EXEC,
+		 MAP_FIXED|MAP_PRIVATE, zero_fd, 0) != (void *)addr)
+		err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr);
+	return (void *)addr;
+}
+
+static struct lguest_device_desc *
+get_dev_entry(struct lguest_device_desc *descs, u16 type, u16 num_pages)
+{
+	static unsigned long top = RESERVE_TOP;
+	int i;
+	unsigned long pfn = 0;
+
+	if (num_pages) {
+		top -= num_pages*getpagesize();
+		map_pages(top, num_pages);
+		pfn = top / getpagesize();
+	}
+
+	for (i = 0; i < LGUEST_MAX_DEVICES; i++) {
+		if (!descs[i].type) {
+			descs[i].features = descs[i].status = 0;
+			descs[i].type = type;
+			descs[i].num_pages = num_pages;
+			descs[i].pfn = pfn;
+			return &descs[i];
+		}
+	}
+	errx(1, "too many devices");
+}
+
+static void set_fd(int fd, struct devices *devices)
+{
+	FD_SET(fd, &devices->infds);
+	if (fd > devices->max_infd)
+		devices->max_infd = fd;
+}
+
+/* Allocate a device, link it into devices, claim a descriptor; exits on OOM. */
+static struct device *new_device(struct devices *devices,
+				 struct lguest_device_desc *descs,
+				 u16 type, u16 num_pages, int fd,
+				 int (*handle_input)(int, struct device *),
+				 unsigned long watch_off,
+				 u32 (*handle_output)(int,
+						      const struct iovec *,
+						      unsigned,
+						      struct device *))
+{
+	struct device *dev = malloc(sizeof(*dev));
+	if (!dev)
+		err(1, "allocating device");
+	dev->next = devices->dev;	/* push onto the front of the list */
+	devices->dev = dev;
+	dev->fd = fd;
+	if (handle_input)		/* only fds with an input handler are polled */
+		set_fd(dev->fd, devices);
+	dev->desc = get_dev_entry(descs, type, num_pages);
+	dev->mem = (void *)(dev->desc->pfn * getpagesize());
+	dev->handle_input = handle_input;
+	dev->watch_address = (unsigned long)dev->mem + watch_off;
+	dev->handle_output = handle_output;
+	return dev;
+}
+
+static int tell_kernel(u32 pagelimit, u32 pgdir, u32 start, u32 page_offset)
+{
+	u32 args[] = { LHREQ_INITIALIZE,
+		       pagelimit, pgdir, start, page_offset };
+	int fd = open("/dev/lguest", O_RDWR);
+
+	if (fd < 0)
+		err(1, "Opening /dev/lguest");
+
+	verbose("Telling kernel limit %u, pgdir %i, e=%#08x page_off=0x%08x\n",
+		pagelimit, pgdir, start, page_offset);
+	if (write(fd, args, sizeof(args)) < 0)
+		err(1, "Writing to /dev/lguest");
+	return fd;
+}
+
+static void concat(char *dst, char *args[])
+{
+	unsigned int i, len = 0;
+
+	for (i = 0; args[i]; i++) {
+		strcpy(dst+len, args[i]);
+		strcat(dst+len, " ");
+		len += strlen(args[i]) + 1;
+	}
+	/* In case it's empty. */
+	dst[len] = '\0';
+}
+
+/* Die unless guest range [addr, addr+size] lies strictly below RESERVE_TOP. */
+static void *_check_pointer(unsigned long addr, unsigned size, unsigned line)
+{
+	if (addr >= RESERVE_TOP || size >= RESERVE_TOP - addr) /* no wraparound */
+		errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
+	return (void *)addr;
+}
+#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
+
+/* Returns pointer to dma->used_len */
+static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
+{
+	unsigned int i;
+	struct lguest_dma *udma;
+
+	/* No buffers? */
+	if (dma == 0) {
+		printf("no buffers\n");
+		return NULL;
+	}
+
+	udma = check_pointer(dma, sizeof(*udma));
+	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+		if (!udma->len[i])
+			break;
+
+		iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]);
+		iov[i].iov_len = udma->len[i];
+	}
+	*num = i;
+	return &udma->used_len;
+}
+
+static u32 *get_dma_buffer(int fd, void *addr,
+			   struct iovec iov[], unsigned *num, u32 *irq)
+{
+	u32 buf[] = { LHREQ_GETDMA, (u32)addr };
+	unsigned long udma;
+	u32 *res;
+
+	udma = write(fd, buf, sizeof(buf));
+	if (udma == (unsigned long)-1)
+		return NULL;
+
+	/* Kernel stashes irq in ->used_len. */
+	res = dma2iov(udma, iov, num);
+	if (res)
+		*irq = *res;
+	return res;
+}
+
+static void trigger_irq(int fd, u32 irq)
+{
+	u32 buf[] = { LHREQ_IRQ, irq };
+	if (write(fd, buf, sizeof(buf)) != 0)
+		err(1, "Triggering irq %i", irq);
+}
+
+static struct termios orig_term;
+static void restore_term(void)
+{
+	tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
+}
+
+struct console_abort
+{
+	int count;
+	struct timeval start;
+};
+
+/* We DMA input to buffer bound at start of console page. */
+static int handle_console_input(int fd, struct device *dev)
+{
+	u32 num, irq = 0, *lenp;
+	int len;
+	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+	struct console_abort *abort = dev->priv;
+
+	lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
+	if (!lenp) {
+		warn("console: no dma buffer!");
+		iov[0] = discard_iov;
+		num = 1;
+	}
+
+	len = readv(dev->fd, iov, num);
+	if (len <= 0) {
+		warnx("Failed to get console input, ignoring console.");
+		len = 0;
+	}
+
+	if (lenp) {
+		*lenp = len;
+		trigger_irq(fd, irq);
+	}
+
+	/* Three ^C within one second?  Exit. */
+	if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) {
+		if (!abort->count++)
+			gettimeofday(&abort->start, NULL);
+		else if (abort->count == 3) {
+			struct timeval now;
+			gettimeofday(&now, NULL);
+			if (now.tv_sec <= abort->start.tv_sec+1)
+				exit(2);
+			abort->count = 0;
+		}
+	} else
+		abort->count = 0;
+
+	if (!len) {
+		restore_term();
+		return 0;
+	}
+	return 1;
+}
+
+static unsigned long peer_offset(unsigned int peernum)
+{
+	return 4 * peernum;
+}
+
+static u32 handle_tun_output(int fd, const struct iovec *iov,
+			     unsigned num, struct device *dev)
+{
+	/* Now we've seen output, we should warn if we can't get buffers. */
+	*(bool *)dev->priv = true;
+	return writev(dev->fd, iov, num);
+}
+
+static u32 handle_block_output(int fd, const struct iovec *iov,
+			       unsigned num, struct device *dev)
+{
+	struct lguest_block_page *p = dev->mem;
+	u32 irq, reply_num, *lenp;
+	int len;
+	struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
+	off64_t device_len, off = (off64_t)p->sector * 512;
+
+	device_len = *(off64_t *)dev->priv;
+
+	if (off >= device_len)
+		err(1, "Bad offset %llu vs %llu", off, device_len);
+	if (lseek64(dev->fd, off, SEEK_SET) != off)
+		err(1, "Bad seek to sector %i", p->sector);
+
+	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
+
+	lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
+	if (!lenp)
+		err(1, "Block request didn't give us a dma buffer");
+
+	if (p->type) {
+		len = writev(dev->fd, iov, num);
+		if (off + len > device_len) {
+			ftruncate(dev->fd, device_len);
+			errx(1, "Write past end %llu+%u", off, len);
+		}
+		*lenp = 0;
+	} else {
+		len = readv(dev->fd, reply, reply_num);
+		*lenp = len;
+	}
+
+	p->result = 1 + (p->bytes != len);
+	trigger_irq(fd, irq);
+	return 0;
+}
+
+#define HIPQUAD(ip)				\
+	((u8)(ip >> 24)),			\
+	((u8)(ip >> 16)),			\
+	((u8)(ip >> 8)),			\
+	((u8)(ip))
+
+static void configure_device(int fd, const char *devname, u32 ipaddr,
+			     unsigned char hwaddr[6])
+{
+	struct ifreq ifr;
+	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+
+	memset(&ifr, 0, sizeof(ifr));
+	strcpy(ifr.ifr_name, devname);
+	sin->sin_family = AF_INET;
+	sin->sin_addr.s_addr = htonl(ipaddr);
+	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
+		err(1, "Setting %s interface address", devname);
+	ifr.ifr_flags = IFF_UP;
+	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
+		err(1, "Bringing interface %s up", devname);
+
+	if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
+		err(1, "getting hw address for %s", devname);
+
+	memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
+}
+
+/* We send lguest_add signals while input is pending: avoids races. */
+static void wake_parent(int pipefd, struct devices *devices)
+{
+	int parent = getppid();
+	nice(19);
+
+	set_fd(pipefd, devices);
+
+	for (;;) {
+		fd_set rfds = devices->infds;
+
+		select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
+		if (FD_ISSET(pipefd, &rfds)) {
+			int ignorefd;
+			if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0)
+				exit(0);
+			FD_CLR(ignorefd, &devices->infds);
+		}
+		kill(parent, SIGUSR1);
+	}
+}
+
+/* We don't want signal to kill us, just jerk us out of kernel. */
+static void wakeup(int signo)
+{
+}
+
+static int handle_tun_input(int fd, struct device *dev)
+{
+	u32 irq = 0, num, *lenp;
+	int len;
+	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+
+	lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
+			      &irq);
+	if (!lenp) {
+		if (*(bool *)dev->priv)
+			warn("network: no dma buffer!");
+		iov[0] = discard_iov;
+		num = 1;
+	}
+
+	len = readv(dev->fd, iov, num);
+	if (len <= 0)
+		err(1, "reading network");
+	if (lenp) {
+		*lenp = len;
+		trigger_irq(fd, irq);
+	}
+	verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
+		((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
+		lenp ? "sent" : "discarded");
+	return 1;
+}
+
+/* We use fcntl locks to reserve network slots (autocleanup!) */
+static unsigned int find_slot(int netfd, const char *filename)
+{
+	struct flock fl;
+
+	fl.l_type = F_WRLCK;
+	fl.l_whence = SEEK_SET;
+	fl.l_len = 1;
+	for (fl.l_start = 0;
+	     fl.l_start < getpagesize()/sizeof(struct lguest_net);
+	     fl.l_start++) {
+		if (fcntl(netfd, F_SETLK, &fl) == 0)
+			return fl.l_start;
+	}
+	errx(1, "No free slots in network file %s", filename);
+}
+
+static void setup_net_file(const char *filename,
+			   struct lguest_device_desc *descs,
+			   struct devices *devices)
+{
+	int netfd;
+	struct device *dev;
+
+	netfd = open(filename, O_RDWR, 0);
+	if (netfd < 0) {
+		if (errno == ENOENT) {
+			netfd = open(filename, O_RDWR|O_CREAT, 0600);
+			if (netfd >= 0) {
+				char page[getpagesize()];
+				/* 0xFFFF == NO_GUEST */
+				memset(page, 0xFF, sizeof(page));
+				write(netfd, page, sizeof(page));
+			}
+		}
+		if (netfd < 0)
+			err(1, "cannot open net file '%s'", filename);
+	}
+
+	dev = new_device(devices, descs, LGUEST_DEVICE_T_NET, 1,
+			 -1, NULL, 0, NULL);
+
+	/* This is the slot for the guest to use. */
+	dev->desc->features = find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM;
+	/* We overwrite the /dev/zero mapping with the actual file. */
+	if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
+			 MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
+			err(1, "could not mmap '%s'", filename);
+	verbose("device %p@%p: shared net %s, peer %i\n", dev->desc,
+		(void *)(dev->desc->pfn * getpagesize()), filename,
+		dev->desc->features & ~LGUEST_NET_F_NOCSUM);
+}
+
+static u32 str2ip(const char *ipaddr)
+{
+	unsigned int b[4];	/* octets of "a.b.c.d"; result is host byte order */
+	if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
+		errx(1, "Invalid IP address '%s'", ipaddr);
+	return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
+}
+
+/* Add interface if_name to bridge br_name via fd (adapted from libbridge). */
+static void add_to_bridge(int fd, const char *if_name, const char *br_name)
+{
+	int r, ifidx;
+	struct ifreq ifr;
+
+	if (!*br_name)
+		errx(1, "must specify bridge name");
+	memset(&ifr, 0, sizeof(ifr));	/* so a truncated name stays terminated */
+	ifidx = if_nametoindex(if_name);
+	if (!ifidx)
+		errx(1, "interface %s does not exist!\n", if_name);
+
+	strncpy(ifr.ifr_name, br_name, IFNAMSIZ - 1);
+	ifr.ifr_ifindex = ifidx;
+	r = ioctl(fd, SIOCBRADDIF, &ifr);
+	if (r != -1)
+		return;
+
+	switch (errno) {
+	case ENODEV:
+		errx(1, "bridge %s does not exist!\n", br_name);
+	case EBUSY:
+		errx(1, "device %s is already a member of a bridge; "
+			"can't enslave it to bridge %s.\n", if_name, br_name);
+	case ELOOP:
+		errx(1, "device %s is a bridge device itself; "
+			"can't enslave a bridge device to a bridge device.\n",
+			if_name);
+	default:
+		err(1, "can't add %s to bridge %s\n", if_name, br_name);
+	}
+}
+
+
+static void setup_tun_net(const char *arg,
+			  struct lguest_device_desc *descs,
+			  struct devices *devices)
+{
+	struct device *dev;
+	struct ifreq ifr;
+	int netfd, ipfd;
+	u32 ipaddr;
+	const char *br_name = NULL;
+
+	netfd = open("/dev/net/tun", O_RDWR);
+	if (netfd < 0)
+		err(1, "opening /dev/net/tun");
+
+	memset(&ifr, 0, sizeof(ifr));
+	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
+	strcpy(ifr.ifr_name, "tap%d");
+	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
+		err(1, "configuring /dev/net/tun");
+
+	dev = new_device(devices, descs, LGUEST_DEVICE_T_NET, 1,
+			 netfd, handle_tun_input,
+			 peer_offset(0), handle_tun_output);
+	dev->priv = malloc(sizeof(bool));
+	*(bool *)dev->priv = false;
+
+	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+	if (ipfd < 0)
+		err(1, "opening IP socket");
+
+	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
+		ipaddr = INADDR_ANY;
+		br_name = arg + strlen(BRIDGE_PFX);
+		add_to_bridge(ipfd, ifr.ifr_name, br_name);
+	} else
+		ipaddr = str2ip(arg);
+
+	/* We are peer 0, rest is all NO_GUEST */
+	configure_device(ipfd, ifr.ifr_name, ipaddr, dev->mem);
+	close (ipfd);
+
+	/* You will be peer 1: we should create enough jitter to randomize */
+	dev->desc->features = NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS;
+	verbose("device %p@%p: tun net %u.%u.%u.%u\n", dev->desc,
+		(void *)(dev->desc->pfn * getpagesize()),
+		HIPQUAD(ipaddr));
+	if (br_name)
+		verbose("attched to bridge: %s\n", br_name);
+}
+
+static void setup_block_file(const char *filename,
+			     struct lguest_device_desc *descs,
+			     struct devices *devices)
+{
+	int fd;
+	struct device *dev;
+	off64_t *blocksize;
+	struct lguest_block_page *p;
+
+	fd = open(filename, O_RDWR|O_LARGEFILE|O_DIRECT, 0);
+	if (fd < 0)
+		err(1, "Opening %s", filename);
+
+	dev = new_device(devices, descs, LGUEST_DEVICE_T_BLOCK, 1,
+			 fd, NULL, 0, handle_block_output);
+	dev->desc->features = LGUEST_DEVICE_F_RANDOMNESS;
+	blocksize = dev->priv = malloc(sizeof(*blocksize));
+	*blocksize = lseek64(fd, 0, SEEK_END);
+	p = dev->mem;
+
+	p->num_sectors = *blocksize/512;
+	verbose("device %p@%p: block %i sectors\n", dev->desc,
+		(void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
+}
+
+static u32 handle_console_output(int fd, const struct iovec *iov,
+				 unsigned num, struct device*dev)
+{
+	return writev(STDOUT_FILENO, iov, num);
+}
+
+static void setup_console(struct lguest_device_desc *descs,
+			  struct devices *devices)
+{
+	struct device *dev;
+
+	if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
+		struct termios term = orig_term;
+		term.c_lflag &= ~(ISIG|ICANON|ECHO);
+		tcsetattr(STDIN_FILENO, TCSANOW, &term);
+		atexit(restore_term);
+	}
+
+	/* We don't currently require a page for the console. */
+	dev = new_device(devices, descs, LGUEST_DEVICE_T_CONSOLE, 0,
+			 STDIN_FILENO, handle_console_input,
+			 4, handle_console_output);
+	dev->priv = malloc(sizeof(struct console_abort));
+	((struct console_abort *)dev->priv)->count = 0;
+	verbose("device %p@%p: console\n", dev->desc,
+		(void *)(dev->desc->pfn * getpagesize()));
+}
+
+static const char *get_arg(const char *arg, const char *prefix)
+{
+	if (strncmp(arg, prefix, strlen(prefix)) == 0)
+		return arg + strlen(prefix);
+	return NULL;
+}
+
+static u32 handle_device(int fd, unsigned long dma, unsigned long addr,
+			 struct devices *devices)
+{
+	struct device *i;
+	u32 *lenp;
+	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+	unsigned num = 0;
+
+	lenp = dma2iov(dma, iov, &num);
+	if (!lenp)
+		errx(1, "Bad SEND_DMA %li for address %#lx\n", dma, addr);
+
+	for (i = devices->dev; i; i = i->next) {
+		if (i->handle_output && addr == i->watch_address) {
+			*lenp = i->handle_output(fd, iov, num, i);
+			return 0;
+		}
+	}
+	warnx("Pending dma %p, addr %p", (void *)dma, (void *)addr);
+	return 0;
+}
+
+static void handle_input(int fd, int childfd, struct devices *devices)
+{
+	struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
+
+	for (;;) {
+		struct device *i;
+		fd_set fds = devices->infds;
+
+		if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0)
+			break;
+
+		for (i = devices->dev; i; i = i->next) {
+			if (i->handle_input && FD_ISSET(i->fd, &fds)) {
+				if (!i->handle_input(fd, i)) {
+					FD_CLR(i->fd, &devices->infds);
+					/* Tell child to ignore it too... */
+					write(childfd, &i->fd, sizeof(i->fd));
+				}
+			}
+		}
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned long mem, pgdir, entry, initrd_size, page_offset;
+	int arg, kern_fd, fd, child, pipefd[2];
+	Elf32_Ehdr hdr;
+	struct sigaction act;
+	sigset_t sigset;
+	struct lguest_device_desc *devdescs;
+	struct devices devices;
+	struct lguest_boot_info *boot = (void *)0;
+	const char *initrd_name = NULL;
+	u32 (*load)(int, const Elf32_Ehdr *ehdr, unsigned long,
+		    unsigned long *, const char *, unsigned long *,
+		    unsigned long *);
+
+	if (argv[1] && strcmp(argv[1], "--verbose") == 0) {
+		verbose = true;
+		argv++;
+		argc--;
+	}
+
+	if (argc < 4)
+		errx(1, "Usage: lguest [--verbose] <mem> vmlinux "
+			"[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)"
+			"|--block=<filename>|--initrd=<filename>]... [args...]");
+
+	zero_fd = open("/dev/zero", O_RDONLY, 0);
+	if (zero_fd < 0)
+		err(1, "Opening /dev/zero");
+
+	mem = memparse(argv[1]);
+	kern_fd = open(argv[2], O_RDONLY, 0);
+	if (kern_fd < 0)
+		err(1, "Opening %s", argv[2]);
+
+	if (read(kern_fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+		err(1, "Reading %s elf header", argv[2]);
+
+	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
+		load = map_elf;
+	else
+		load = load_bzimage;
+
+	devices.max_infd = -1;
+	devices.dev = NULL;
+	FD_ZERO(&devices.infds);
+
+	devdescs = map_pages(mem, 1);
+	arg = 3;
+	while (argv[arg] && argv[arg][0] == '-') {
+		const char *argval;
+
+		if ((argval = get_arg(argv[arg], "--sharenet=")) != NULL)
+			setup_net_file(argval, devdescs, &devices);
+		else if ((argval = get_arg(argv[arg], "--tunnet=")) != NULL)
+			setup_tun_net(argval, devdescs, &devices);
+		else if ((argval = get_arg(argv[arg], "--block=")) != NULL)
+			setup_block_file(argval, devdescs, &devices);
+		else if ((argval = get_arg(argv[arg], "--initrd=")) != NULL)
+			initrd_name = argval;
+		else
+			errx(1, "unknown arg '%s'", argv[arg]);
+		arg++;
+	}
+
+	entry = load(kern_fd, &hdr, mem, &pgdir, initrd_name, &initrd_size,
+		     &page_offset);
+	setup_console(devdescs, &devices);
+
+	concat(boot->cmdline, argv+arg);
+	boot->max_pfn = mem/getpagesize();
+	boot->initrd_size = initrd_size;
+
+	act.sa_handler = wakeup;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = 0;
+	sigaction(SIGUSR1, &act, NULL);
+
+	pipe(pipefd);
+	child = fork();
+	if (child == -1)
+		err(1, "forking");
+
+	if (child == 0) {
+		close(pipefd[1]);
+		wake_parent(pipefd[0], &devices);
+	}
+	close(pipefd[0]);
+
+	sigemptyset(&sigset);
+	sigaddset(&sigset, SIGUSR1);
+	sigprocmask(SIG_BLOCK, &sigset, NULL);
+
+	fd = tell_kernel(RESERVE_TOP/getpagesize(), pgdir, entry, page_offset);
+
+	for (;;) {
+		unsigned long arr[2];
+		int readval;
+
+		sigprocmask(SIG_UNBLOCK, &sigset, NULL);
+		readval = read(fd, arr, sizeof(arr));
+		sigprocmask(SIG_BLOCK, &sigset, NULL);
+
+		switch (readval) {
+		case sizeof(arr):
+			handle_device(fd, arr[0], arr[1], &devices);
+			break;
+		case -1:
+			if (errno == EINTR)
+				break;
+		default:
+			if (errno == ENOENT) {
+				char reason[1024];
+				if (read(fd, reason, sizeof(reason)) > 0)
+					errx(1, "%s", reason);
+			}
+			err(1, "Running guest failed");
+		}
+		handle_input(fd, pipefd[1], &devices);
+	}
+}
Index: work-pv/Documentation/lguest/x86_64/Makefile
===================================================================
--- /dev/null
+++ work-pv/Documentation/lguest/x86_64/Makefile
@@ -0,0 +1,22 @@
+# This creates the demonstration utility "lguest" which runs a Linux guest.
+
+# For now on x86_64 we'll hard code the location of the lguest binary loader.
+# But when we can get a relocatable kernel, we'll have to work to make this
+# dynamic.
+LGUEST_GUEST_TOP := 0x7f000000
+
+CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \
+	-g \
+	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
+LDLIBS:=-lz
+
+all: lguest.lds lguest
+
+# The linker script on x86 is so complex the only way of creating one
+# which will link our binary in the right place is to mangle the
+# default one.
+lguest.lds: Makefile
+	$(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
+
+clean:
+	rm -f lguest.lds lguest
Index: work-pv/Documentation/lguest/x86_64/lguest.c
===================================================================
--- /dev/null
+++ work-pv/Documentation/lguest/x86_64/lguest.c
@@ -0,0 +1,1021 @@
+/* Simple program to layout "physical" memory for new lguest guest.
+ * Linked high to avoid likely physical memory.  */
+#define _LARGEFILE64_SOURCE
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <err.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <elf.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <time.h>
+#include <netinet/in.h>
+#include <linux/if.h>
+#include <linux/if_tun.h>
+#include <asm/vsyscall.h>
+#include <sys/uio.h>
+#include <termios.h>
+#include <zlib.h>
+typedef uint64_t u64;
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+#include "../../../include/asm/lguest_user.h"
+
+#define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
+/* Peer number handed to the guest on a tun network device (the setup code
+ * there notes that the launcher side is peer 0). */
+#define NET_PEERNUM 1
+
+/* Set by the --verbose command-line switch. */
+static bool verbose;
+/* printf() the arguments, but only when the flag above is set.  The
+ * fflush(stdout) sits outside the "if", so it runs on every use. */
+#define verbose(args...) \
+	do { if (verbose) printf(args); fflush(stdout); } while(0)
+
+/* Everything the launcher knows about its devices: the set of input file
+ * descriptors to select() on, plus the list of devices themselves. */
+struct devices
+{
+	/* Descriptors that handle_input() and wake_parent() select() on. */
+	fd_set infds;
+	/* Highest descriptor added to infds so far (-1 when empty). */
+	int max_infd;
+
+	/* Head of the singly linked device list; new_device() pushes here. */
+	struct device *dev;
+};
+
+/* One emulated device, chained off struct devices through "next". */
+struct device
+{
+	/* Next device in the list, or NULL. */
+	struct device *next;
+	/* This device's slot in the descriptor array (see get_dev_entry()). */
+	struct lguest_device_desc *desc;
+	/* Start of the device's own pages: desc->pfn * page size. */
+	void *mem;
+
+	/* Watch this fd if handle_input non-NULL. */
+	int fd;
+	int (*handle_input)(int fd, struct device *me);
+
+	/* Watch DMA to this address if handle_output non-NULL. */
+	unsigned long watch_address;
+	u64 (*handle_output)(int fd, const struct iovec *iov,
+			     unsigned int num, struct device *me);
+
+	/* Device-specific data. */
+	void *priv;
+};
+
+/* Scratch space behind discard_iov. */
+static char buf[1024];
+/* Input handlers read into this when no DMA buffer is available, which
+ * drains the descriptor and throws the data away. */
+static struct iovec discard_iov = { .iov_base=buf, .iov_len=sizeof(buf) };
+/* /dev/zero, opened in main(); the source of all anonymous mappings. */
+static int zero_fd;
+
+/* Parse a memory size such as "64M": a number in any base strtoul()
+ * accepts, optionally followed by K, M or G (either case), each step
+ * multiplying by 1024.  Any other trailing text is ignored. */
+static u64 memparse(const char *ptr)
+{
+	char *suffix;
+	unsigned long value = strtoul(ptr, &suffix, 0);
+	unsigned int shift = 0;
+
+	if (*suffix == 'G' || *suffix == 'g')
+		shift = 30;
+	else if (*suffix == 'M' || *suffix == 'm')
+		shift = 20;
+	else if (*suffix == 'K' || *suffix == 'k')
+		shift = 10;
+
+	return value << shift;
+}
+
+/* Round addr up to the next multiple of the page size. */
+static inline unsigned long page_align(unsigned long addr)
+{
+	unsigned long mask = getpagesize() - 1;
+
+	return (addr + mask) & ~mask;
+}
+
+/* Map the initrd file NAME so that it ends at END, the top of guest memory.
+ *
+ * Returns the number of bytes reserved below END, or 0 when NAME is NULL.
+ * mmap() with MAP_FIXED needs a page-aligned address, so the file size is
+ * rounded up to whole pages; the tail of the last page reads as zeroes.
+ * The rounded size is what gets returned, so that END minus the return
+ * value is the address the data starts at.
+ * NOTE(review): assumes the guest locates the initrd as "top of memory
+ * minus initrd_size" -- confirm against the guest-side boot code.
+ *
+ * Exits on any failure. */
+static unsigned long load_initrd(const char *name, unsigned long end)
+{
+	int ifd;
+	struct stat st;
+	void *iaddr;
+	char *want;
+	unsigned long len;
+
+	if (!name)
+		return 0;
+
+	ifd = open(name, O_RDONLY, 0);
+	if (ifd < 0)
+		err(1, "Opening initrd '%s'", name);
+
+	if (fstat(ifd, &st) < 0)
+		err(1, "fstat() on initrd '%s'", name);
+
+	if (st.st_size <= 0)
+		errx(1, "initrd '%s' is empty", name);
+
+	len = page_align(st.st_size);
+	if (len > end)
+		errx(1, "initrd '%s' does not fit in %lu bytes of memory",
+		     name, end);
+
+	want = (char *)end - len;
+	iaddr = mmap(want, len,
+		     PROT_READ|PROT_EXEC|PROT_WRITE,
+		     MAP_FIXED|MAP_PRIVATE, ifd, 0);
+	if (iaddr != (void *)want)
+		err(1, "Mmaping initrd '%s' returned %p not %p",
+		    name, iaddr, (void *)want);
+	/* The mapping stays valid after the descriptor is closed. */
+	close(ifd);
+	verbose("mapped initrd %s size=%lu @ %p\n", name,
+		(unsigned long)st.st_size, iaddr);
+	return len;
+}
+
+/* Map mem bytes of /dev/zero, private and fully accessible, at address 0.
+ * Exits if the mapping does not land exactly there. */
+static void map_memory(unsigned long mem)
+{
+	void *base = mmap(0, mem,
+			  PROT_READ|PROT_WRITE|PROT_EXEC,
+			  MAP_FIXED|MAP_PRIVATE, zero_fd, 0);
+
+	if (base != (void *)0)
+		err(1, "Mmaping /dev/zero for %li bytes", mem);
+}
+
+/* Read SIZE bytes found at offset OFF of FD into a freshly malloc()ed
+ * buffer.  WHAT names the data in error messages.  Exits on any failure;
+ * the caller owns the result and must free() it. */
+static void *read_elf_chunk(int fd, Elf64_Off off, Elf64_Xword size,
+			    const char *what)
+{
+	/* malloc(0) may legitimately return NULL, so never ask for 0. */
+	void *data = malloc(size ? size : 1);
+	ssize_t got;
+
+	if (!data)
+		err(1, "Not enough memory for %s", what);
+	if (lseek(fd, off, SEEK_SET) < 0)
+		err(1, "Seeking to %s", what);
+	got = read(fd, data, size);
+	if (got < 0)
+		err(1, "Reading %s", what);
+	if ((Elf64_Xword)got != size)
+		errx(1, "Reading %s: file is truncated", what);
+	return data;
+}
+
+/* Load an x86_64 ELF kernel image into guest memory.
+ *
+ * elf_fd:      open kernel image; ehdr is its already-read ELF header.
+ * mem:         size of guest memory, mapped here from address 0.
+ * pgdir_addr:  out: value of the "boot_level4_pgt" symbol minus the page
+ *              offset.
+ * initrd:      initrd file name or NULL; ird_size receives what
+ *              load_initrd() returns for it.
+ * page_offset: out: p_vaddr - p_paddr of the loadable segments.
+ *
+ * Returns the entry point (e_entry plus the page offset).  Exits on a
+ * malformed image or any I/O failure. */
+static u64 map_elf(int elf_fd, const Elf64_Ehdr *ehdr, unsigned long mem,
+		   unsigned long *pgdir_addr,
+		   const char *initrd, unsigned long *ird_size,
+		   u64 *page_offset)
+{
+	void *addr;
+	Elf64_Phdr *phdr;
+	Elf64_Shdr *sec = NULL;
+	Elf64_Sym *syms = NULL;
+	char *strtab = NULL;
+	unsigned long nsyms = 0, strtab_size = 0, n;
+	unsigned int i;
+
+	/* Sanity checks: done before anything is sized from the header. */
+	if (ehdr->e_type != ET_EXEC
+	    || ehdr->e_machine != EM_X86_64
+	    || ehdr->e_phentsize != sizeof(Elf64_Phdr)
+	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf64_Phdr))
+		errx(1, "Malformed elf header");
+	if (ehdr->e_shnum && ehdr->e_shentsize != sizeof(Elf64_Shdr))
+		errx(1, "Malformed elf header");
+
+	phdr = read_elf_chunk(elf_fd, ehdr->e_phoff,
+			      ehdr->e_phnum * sizeof(*phdr),
+			      "program headers");
+
+	map_memory(mem);
+
+	*page_offset = 0;
+	/* The caller's variable may be uninitialised, and the "not found"
+	 * test below relies on this being 0. */
+	*pgdir_addr = 0;
+
+	/* We map the loadable segments at virtual addresses corresponding
+	 * to their physical addresses (our virtual == guest physical). */
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		if (phdr[i].p_type != PT_LOAD)
+			continue;
+
+		verbose("Section %i: size %li addr %p\n",
+			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+		/* Written so that the addition cannot wrap around. */
+		if (phdr[i].p_paddr > mem
+		    || phdr[i].p_memsz > mem - phdr[i].p_paddr)
+			errx(1, "Segment %i overlaps end of memory", i);
+
+		/* We expect linear address space. */
+		if (!*page_offset)
+			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
+		else if ((*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr) &&
+			 phdr[i].p_vaddr != VSYSCALL_START)
+			errx(1, "Page offset of section %i different (got %lx, expected %lx)",
+			     i, (phdr[i].p_vaddr - phdr[i].p_paddr), *page_offset);
+
+		/* Recent ld versions don't page align any more. */
+		if (phdr[i].p_paddr % getpagesize()) {
+			phdr[i].p_filesz += (phdr[i].p_paddr % getpagesize());
+			phdr[i].p_offset -= (phdr[i].p_paddr % getpagesize());
+			phdr[i].p_paddr -= (phdr[i].p_paddr % getpagesize());
+		}
+		/* We map everything private, writable. */
+		addr = mmap((void *)phdr[i].p_paddr,
+			    phdr[i].p_filesz,
+			    PROT_READ|PROT_WRITE|PROT_EXEC,
+			    MAP_FIXED|MAP_PRIVATE,
+			    elf_fd, phdr[i].p_offset);
+		if (addr != (void *)phdr[i].p_paddr)
+			err(1, "Mmaping vmlinux segment %i returned %p not %p (%p)",
+			    i, addr, (void *)phdr[i].p_paddr, &phdr[i].p_paddr);
+	}
+	free(phdr);
+
+	/* Now process sections searching for boot page tables
+	 * Start by finding the symtab section */
+	if (ehdr->e_shnum)
+		sec = read_elf_chunk(elf_fd, ehdr->e_shoff,
+				     ehdr->e_shnum * sizeof(*sec),
+				     "section headers");
+
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		Elf64_Word link;
+
+		if (sec[i].sh_type != SHT_SYMTAB)
+			continue;
+
+		syms = read_elf_chunk(elf_fd, sec[i].sh_offset,
+				      sec[i].sh_size, "symbol table");
+		nsyms = sec[i].sh_size / sizeof(Elf64_Sym);
+
+		/* symtab links to strtab. We use it to find symbol
+		 * names */
+		link = sec[i].sh_link;
+		if (link >= ehdr->e_shnum)
+			errx(1, "Symbol table links to bad section %u", link);
+		strtab = read_elf_chunk(elf_fd, sec[link].sh_offset,
+					sec[link].sh_size, "string table");
+		strtab_size = sec[link].sh_size;
+		/* strcmp() below needs every name to be terminated. */
+		if (strtab_size && strtab[strtab_size - 1] != '\0')
+			errx(1, "String table is not NUL-terminated");
+		break;
+	}
+
+	/* We now have a pointer to the symtab, start searching for the symbol */
+	for (n = 0; n < nsyms; n++) {
+		if ((syms[n].st_shndx == SHN_UNDEF) || !syms[n].st_name)
+			continue;
+		/* Ignore names that point outside the string table. */
+		if (syms[n].st_name >= strtab_size)
+			continue;
+		if (!strcmp("boot_level4_pgt", strtab + syms[n].st_name)) {
+			*pgdir_addr = syms[n].st_value - *page_offset;
+			break;
+		}
+	}
+
+	free(strtab);
+	free(syms);
+	free(sec);
+
+	/* Not a system-call failure, so no errno text. */
+	if (!*pgdir_addr)
+		errx(1, "Unable to find boot pgdir");
+
+	*ird_size = load_initrd(initrd, mem);
+
+	/* Entry is physical address: convert to virtual */
+	printf("entry=%lx page_offset=%lx  entry+page_offset=%lx\n",
+	       ehdr->e_entry, *page_offset, ehdr->e_entry + *page_offset);
+	return ehdr->e_entry + *page_offset;
+}
+
+/* Guess the page offset of an unpacked kernel image.
+ *
+ * Scans img[0..len) for opcode byte 0xA1 ("mov 0xXXXXXXXX,%eax") and
+ * tallies the byte found four positions later, i.e. the top byte of the
+ * 32-bit address operand.  As soon as one value has been seen more than
+ * three times it is returned shifted into bits 24-31.  Exits if no value
+ * gets that far. */
+static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
+{
+	/* The tallies must start at zero; the index is as wide as len so
+	 * the loop condition cannot wrap on a huge image. */
+	unsigned int possibilities[256] = { 0 };
+	unsigned long i;
+
+	for (i = 0; i + 4 < len; i++) {
+		/* mov 0xXXXXXXXX,%eax */
+		if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
+			return (unsigned long)img[i+4] << 24;
+	}
+	errx(1, "could not determine page offset");
+}
+
+/* Unpack the gzip stream at the current position of fd to guest-physical
+ * address 0x100000, after mapping mem bytes of guest memory.
+ *
+ * page_offset receives the value guessed by intuit_page_offset();
+ * ird_size receives what load_initrd() returns for initrd.
+ * Returns the load address plus the page offset.  Exits on failure.
+ *
+ * TODO: nothing here can locate the boot page tables yet, so *pgdir_addr
+ * is set to 0 rather than left uninitialised.
+ * NOTE(review): confirm what the kernel side should be given on this path. */
+static u64 bzimage(int fd, unsigned long mem, unsigned long *pgdir_addr,
+		   const char *initrd, unsigned long *ird_size,
+		   u64 *page_offset)
+{
+	gzFile f;
+	int ret;
+	unsigned long len = 0, capacity;
+	unsigned char *img = (unsigned char *)0x100000;
+
+	if (mem <= (unsigned long)img)
+		errx(1, "%lu bytes of memory is too small for a bzImage", mem);
+	capacity = mem - (unsigned long)img;
+
+	map_memory(mem);
+
+	f = gzdopen(fd, "rb");
+	if (!f)
+		errx(1, "could not set up gzip stream");
+	if (gzdirect(f))
+		errx(1, "did not find correct gzip header");
+
+	/* Never unpack beyond the memory mapped above. */
+	for (;;) {
+		unsigned long room = capacity - len;
+		unsigned int chunk = room < 65536 ? room : 65536;
+
+		if (!chunk) {
+			/* Full: fine only if the stream has ended too. */
+			unsigned char probe;
+
+			ret = gzread(f, &probe, 1);
+			if (ret > 0)
+				errx(1, "unpacked kernel does not fit in %lu bytes of memory",
+				     mem);
+			break;
+		}
+		ret = gzread(f, img + len, chunk);
+		if (ret <= 0)
+			break;
+		len += ret;
+	}
+	if (ret < 0) {
+		int zerr;
+
+		/* zlib keeps its own error state; errno may be unrelated. */
+		errx(1, "reading image from bzImage: %s", gzerror(f, &zerr));
+	}
+
+	verbose("Unpacked size %i addr %p\n", (int)len, (void *)img);
+	*page_offset = intuit_page_offset(img, len);
+	*pgdir_addr = 0;
+	*ird_size = load_initrd(initrd, mem);
+
+	/* Releases zlib's buffers; this also closes fd. */
+	gzclose(f);
+
+	/* Entry is physical address: convert to virtual */
+	return (u64)(unsigned long)img + *page_offset;
+}
+
+/* Find the compressed kernel inside a bzImage and hand it to bzimage().
+ *
+ * Reads the file a byte at a time looking for a 10-byte gzip header:
+ * magic 0x1F 0x8B, seven bytes that are not inspected, then an OS byte of
+ * 0x03.  On a match the file position is put back at the start of the
+ * header.  A candidate with a different OS byte is skipped and the scan
+ * resumes one byte after where that candidate began.
+ *
+ * ehdr is unused; it is only here so that this function has the same
+ * signature as map_elf().  Exits if no header is found. */
+static u64 load_bzimage(int bzimage_fd, const Elf64_Ehdr *ehdr,
+			unsigned long mem, unsigned long *pgdir_addr,
+			const char *initrd, unsigned long *ird_size,
+			u64 *page_offset)
+{
+	unsigned char c;
+	int state = 0;
+
+	(void)ehdr;
+
+	/* Just brute force it. */
+	while (read(bzimage_fd, &c, 1) == 1) {
+		switch (state) {
+		case 0:
+			if (c == 0x1F)
+				state++;
+			break;
+		case 1:
+			if (c == 0x8B)
+				state++;
+			else
+				/* 0x1F 0x1F 0x8B: the second 0x1F may start
+				 * the real magic. */
+				state = (c == 0x1F) ? 1 : 0;
+			break;
+		case 2 ... 8:
+			state++;
+			break;
+		case 9:
+			if (c == 0x03) { /* Compressed under UNIX. */
+				if (lseek(bzimage_fd, -10, SEEK_CUR) < 0)
+					err(1, "Seeking back to gzip header");
+				return bzimage(bzimage_fd, mem, pgdir_addr,
+					       initrd, ird_size, page_offset);
+			}
+			/* False alarm: rescan from just past its first byte. */
+			if (lseek(bzimage_fd, -9, SEEK_CUR) < 0)
+				err(1, "Seeking within bzImage");
+			state = 0;
+			break;
+		}
+	}
+	errx(1, "Could not find kernel in bzImage");
+}
+
+/* Map num pages of /dev/zero, private and fully accessible, at exactly
+ * addr and return that address.  Exits on failure. */
+static void *map_pages(unsigned long addr, unsigned int num)
+{
+	void *want = (void *)addr;
+	void *got = mmap(want, getpagesize() * num,
+			 PROT_READ|PROT_WRITE|PROT_EXEC,
+			 MAP_FIXED|MAP_PRIVATE, zero_fd, 0);
+
+	if (got != want)
+		err(1, "Mmaping %u pages of /dev/zero @%p", num, want);
+	return want;
+}
+
+/* Claim the first unused descriptor in descs for a device of the given
+ * type.  When num_pages is non-zero, that many pages are first carved off
+ * below the previous allocation (which starts at LGUEST_GUEST_TOP) and
+ * mapped; their page frame number is recorded in the descriptor.
+ * Exits if every descriptor is already taken. */
+static struct lguest_device_desc *
+get_dev_entry(struct lguest_device_desc *descs, u16 type, u16 num_pages)
+{
+	static unsigned long top = LGUEST_GUEST_TOP;
+	struct lguest_device_desc *slot;
+	unsigned long pfn = 0;
+
+	if (num_pages != 0) {
+		top -= num_pages*getpagesize();
+		map_pages(top, num_pages);
+		pfn = top / getpagesize();
+	}
+
+	for (slot = descs; slot < descs + LGUEST_MAX_DEVICES; slot++) {
+		/* A non-zero type marks a descriptor that is in use. */
+		if (slot->type)
+			continue;
+
+		slot->features = slot->status = 0;
+		slot->type = type;
+		slot->num_pages = num_pages;
+		slot->pfn = pfn;
+		return slot;
+	}
+	errx(1, "too many devices");
+}
+
+/* Add fd to the watched set and keep max_infd at the highest member. */
+static void set_fd(int fd, struct devices *devices)
+{
+	int highest = devices->max_infd;
+
+	FD_SET(fd, &devices->infds);
+	devices->max_infd = (fd > highest) ? fd : highest;
+}
+
+/* Allocate a device, push it on the front of devices->dev and fill it in.
+ *
+ * type/num_pages: passed to get_dev_entry() to claim a descriptor and,
+ *                 optionally, pages for the device.
+ * fd:             input descriptor; only watched when handle_input is set.
+ * watch_off:      offset from the start of the device's pages of the
+ *                 address whose DMA is routed to handle_output.
+ *
+ * priv starts out NULL for the caller to fill in.  Exits on failure. */
+static struct device *new_device(struct devices *devices,
+				 struct lguest_device_desc *descs,
+				 u16 type, u16 num_pages,
+				 int fd,
+				 int (*handle_input)(int, struct device *),
+				 unsigned long watch_off,
+				 u64 (*handle_output)(int,
+						      const struct iovec *,
+						      unsigned,
+						      struct device *))
+{
+	struct device *dev = malloc(sizeof(*dev));
+
+	if (!dev)
+		err(1, "allocating device");
+
+	dev->next = devices->dev;
+	devices->dev = dev;
+
+	dev->fd = fd;
+	if (handle_input)
+		set_fd(dev->fd, devices);
+	dev->desc = get_dev_entry(descs, type, num_pages);
+	dev->mem = (void *)(dev->desc->pfn * getpagesize());
+	dev->handle_input = handle_input;
+	dev->watch_address = (unsigned long)dev->mem + watch_off;
+	dev->handle_output = handle_output;
+	/* Not every caller sets this; don't leave it indeterminate. */
+	dev->priv = NULL;
+	return dev;
+}
+
+#define DEVNAME "/dev/lguest"
+
+/* Open the lguest device and write the LHREQ_INITIALIZE request carrying
+ * the four arguments.  Returns the open descriptor; exits on failure. */
+static int tell_kernel(u64 pagelimit, u64 pgdir, u64 start, u64 page_offset)
+{
+	u64 args[5];
+	int fd = open(DEVNAME, O_RDWR);
+
+	if (fd < 0)
+		err(1, "Opening %s", DEVNAME);
+
+	args[0] = LHREQ_INITIALIZE;
+	args[1] = pagelimit;
+	args[2] = pgdir;
+	args[3] = start;
+	args[4] = page_offset;
+
+	verbose("Telling kernel limit %lu, pgdir %li, e=%#08lx page_off=0x%08lx\n",
+		pagelimit, pgdir, start, page_offset);
+	if (write(fd, args, sizeof(args)) < 0)
+		err(1, "Writing to /dev/lguest");
+	return fd;
+}
+
+/* Join the NULL-terminated string array args into dst, writing a single
+ * space after every element (the last one included).  An empty array
+ * yields an empty string.  dst must be big enough; nothing is checked. */
+static void concat(char *dst, char *args[])
+{
+	char *out = dst;
+	char **arg;
+
+	for (arg = args; *arg; arg++) {
+		size_t n = strlen(*arg);
+
+		memcpy(out, *arg, n);
+		out[n] = ' ';
+		out += n + 1;
+	}
+	/* Terminates the result, and covers the empty case. */
+	*out = '\0';
+}
+
+/* Check that the size bytes starting at addr lie strictly below
+ * LGUEST_GUEST_TOP and return addr as a pointer; exits otherwise, naming
+ * the source line that asked.
+ *
+ * size is taken as unsigned long so that a wide length is not silently
+ * truncated before being checked, and the comparison is arranged so that
+ * no addition can wrap around. */
+static void *_check_pointer(unsigned long addr, unsigned long size,
+			    unsigned int line)
+{
+	if (addr >= LGUEST_GUEST_TOP || size >= LGUEST_GUEST_TOP - addr)
+		errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
+	return (void *)addr;
+}
+#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
+
+/* Turn the struct lguest_dma at address dma into an iovec array.
+ * Sections are converted until the first zero length or until
+ * LGUEST_MAX_DMA_SECTIONS, each one validated with check_pointer().
+ * *num receives the section count.  Returns a pointer to the
+ * descriptor's used_len field, or NULL (leaving *num alone) if dma is 0. */
+static u64 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
+{
+	struct lguest_dma *udma;
+	unsigned int used = 0;
+
+	/* No buffers? */
+	if (!dma) {
+		printf("no buffers\n");
+		return NULL;
+	}
+
+	udma = check_pointer(dma, sizeof(*udma));
+	while (used < LGUEST_MAX_DMA_SECTIONS && udma->len[used]) {
+		iov[used].iov_base = check_pointer(udma->addr[used],
+						   udma->len[used]);
+		iov[used].iov_len = udma->len[used];
+		used++;
+	}
+	*num = used;
+	return &udma->used_len;
+}
+
+/* Issue an LHREQ_GETDMA request for addr on the lguest fd.  The value
+ * write() returns is treated as the address of a DMA descriptor, which is
+ * then converted with dma2iov().  On success *irq is loaded from the
+ * descriptor's used_len and a pointer to used_len is returned; NULL is
+ * returned when the write fails or there is no buffer. */
+static u64 *get_dma_buffer(int fd, void *addr,
+			   struct iovec iov[], unsigned *num, u32 *irq)
+{
+	u64 request[2];
+	unsigned long udma;
+	u64 *used_len;
+
+	request[0] = LHREQ_GETDMA;
+	request[1] = (u64)addr;
+
+	udma = write(fd, request, sizeof(request));
+	if (udma == (unsigned long)-1)
+		return NULL;
+
+	used_len = dma2iov(udma, iov, num);
+	if (!used_len)
+		return NULL;
+
+	/* Kernel stashes irq in ->used_len. */
+	*irq = *used_len;
+	return used_len;
+}
+
+/* Write an LHREQ_IRQ request for irq to the lguest fd; exits unless the
+ * write returns exactly 0. */
+static void trigger_irq(int fd, u32 irq)
+{
+	u64 request[] = { LHREQ_IRQ, irq };
+	ssize_t rc = write(fd, request, sizeof(request));
+
+	if (rc != 0)
+		err(1, "Triggering irq %i", irq);
+}
+
+/* Terminal settings of stdin as they were before setup_console() changed
+ * them. */
+static struct termios orig_term;
+/* Put stdin back the way it was; registered with atexit(). */
+static void restore_term(void)
+{
+	const struct termios *saved = &orig_term;
+
+	tcsetattr(STDIN_FILENO, TCSANOW, saved);
+}
+
+/* Bookkeeping for the console's "three ^C within one second" exit. */
+struct console_abort
+{
+	/* Consecutive lone ^C characters seen so far. */
+	int count;
+	/* When the first ^C of the current run arrived. */
+	struct timeval start;
+};
+
+/* Console input: read from the device's fd into the guest's DMA buffer
+ * (bound at the start of the console page) and raise its interrupt.
+ * With no buffer available the input is read into discard_iov instead.
+ * Returns 0, after restoring the terminal, once nothing can be read any
+ * more; 1 otherwise.  Three lone ^C characters within about a second
+ * terminate the launcher with status 2. */
+static int handle_console_input(int fd, struct device *dev)
+{
+	struct console_abort *ctrl_c = dev->priv;
+	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+	u32 num, irq = 0;
+	u64 *lenp;
+	int len;
+	bool lone_ctrl_c;
+
+	lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
+	if (lenp == NULL) {
+		warn("console: no dma buffer!");
+		iov[0] = discard_iov;
+		num = 1;
+	}
+
+	len = readv(dev->fd, iov, num);
+	if (len <= 0) {
+		warnx("Failed to get console input, ignoring console.");
+		len = 0;
+	}
+
+	if (lenp != NULL) {
+		*lenp = len;
+		trigger_irq(fd, irq);
+	}
+
+	/* Three ^C within one second?  Exit. */
+	lone_ctrl_c = (len == 1 && ((char *)iov[0].iov_base)[0] == 3);
+	if (!lone_ctrl_c) {
+		ctrl_c->count = 0;
+	} else if (ctrl_c->count++ == 0) {
+		/* First of a run: start the clock. */
+		gettimeofday(&ctrl_c->start, NULL);
+	} else if (ctrl_c->count == 3) {
+		struct timeval now;
+
+		gettimeofday(&now, NULL);
+		if (now.tv_sec <= ctrl_c->start.tv_sec+1)
+			exit(2);
+		ctrl_c->count = 0;
+	}
+
+	if (len == 0) {
+		restore_term();
+		return 0;
+	}
+	return 1;
+}
+
+/* Offset of peer number peernum: four bytes per peer. */
+static unsigned long peer_offset(unsigned int peernum)
+{
+	unsigned int offset = peernum * 4;
+
+	return offset;
+}
+
+/* Guest output on a tun device: write the buffers to the tun fd and
+ * return what writev() returned.  Also sets the flag behind dev->priv,
+ * which makes the input side start warning about missing buffers. */
+static u64 handle_tun_output(int fd, const struct iovec *iov,
+			     unsigned num, struct device *dev)
+{
+	bool *seen_output = dev->priv;
+
+	/* Now we've seen output, we should warn if we can't get buffers. */
+	*seen_output = true;
+	return writev(dev->fd, iov, num);
+}
+
+/* Carry out the block request described in the device page.
+ *
+ * The page supplies the sector, the direction (type non-zero means write)
+ * and the expected byte count.  Writes take their data from iov; reads go
+ * into the reply buffer fetched with get_dma_buffer(), whose used_len is
+ * set to the number of bytes read (0 if the read failed).  result is set
+ * to 1 when the transfer length equals the requested byte count and to 2
+ * otherwise, then the interrupt is raised.  Always returns 0.
+ * Exits on a bad offset, a failed seek, a missing reply buffer or a write
+ * that ran past the end of the device. */
+static u64 handle_block_output(int fd, const struct iovec *iov,
+			       unsigned num, struct device *dev)
+{
+	struct lguest_block_page *p = dev->mem;
+	u32 irq, reply_num;
+	u64 *lenp;
+	int len;
+	struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
+	off64_t device_len, off = (off64_t)p->sector * 512;
+
+	device_len = *(off64_t *)dev->priv;
+
+	/* A bad request, not a failed call: errno has nothing to say. */
+	if (off >= device_len)
+		errx(1, "Bad offset %lu vs %lu",
+		     (unsigned long)off, (unsigned long)device_len);
+	if (lseek64(dev->fd, off, SEEK_SET) != off)
+		err(1, "Bad seek to sector %i", p->sector);
+
+	verbose("Block: %s at offset %lu\n", p->type ? "WRITE" : "READ",
+		(unsigned long)off);
+
+	lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
+	if (!lenp)
+		err(1, "Block request didn't give us a dma buffer");
+
+	if (p->type) {
+		len = writev(dev->fd, iov, num);
+		if (off + len > device_len) {
+			/* Undo the growth before giving up. */
+			if (ftruncate(dev->fd, device_len) != 0)
+				warn("Truncating back to %lu bytes",
+				     (unsigned long)device_len);
+			errx(1, "Write past end %lu+%u",
+			     (unsigned long)off, len);
+		}
+		*lenp = 0;
+	} else {
+		len = readv(dev->fd, reply, reply_num);
+		/* Don't report a failed read as a huge unsigned length. */
+		*lenp = len < 0 ? 0 : len;
+	}
+
+	p->result = 1 + (p->bytes != len);
+	trigger_irq(fd, irq);
+	return 0;
+}
+
+/* Expand a host-order IPv4 address into four u8 arguments, most
+ * significant byte first, for a "%u.%u.%u.%u" format.  The argument is
+ * parenthesised so that any expression is safe, but it is still evaluated
+ * four times: pass a plain variable, not a function call. */
+#define HIPQUAD(ip)				\
+	((u8)((ip) >> 24)),			\
+	((u8)((ip) >> 16)),			\
+	((u8)((ip) >> 8)),			\
+	((u8)(ip))
+
+/* Give network interface devname the IPv4 address ipaddr (host byte
+ * order), bring it up, and copy its 6-byte hardware address into hwaddr.
+ * Exits on any failure. */
+static void configure_device(const char *devname, u64 ipaddr,
+			     unsigned char hwaddr[6])
+{
+	struct ifreq ifr;
+	int fd;
+	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+
+	memset(&ifr, 0, sizeof(ifr));
+	/* ifr_name is a small fixed-size array. */
+	if (strlen(devname) >= sizeof(ifr.ifr_name))
+		errx(1, "Interface name '%s' is too long", devname);
+	strcpy(ifr.ifr_name, devname);
+	sin->sin_family = AF_INET;
+	sin->sin_addr.s_addr = htonl(ipaddr);
+	fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+	if (fd < 0)
+		err(1, "opening IP socket");
+	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
+		err(1, "Setting %s interface address", devname);
+	ifr.ifr_flags = IFF_UP;
+	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
+		err(1, "Bringing interface %s up", devname);
+
+	if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
+		err(1, "getting hw address for %s", devname);
+
+	memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
+	/* The socket was only a handle for the ioctls. */
+	close(fd);
+}
+
+/* Body of the forked helper process; never returns.
+ *
+ * Waits in select() on every device input fd plus pipefd, and sends
+ * SIGUSR1 to the parent whenever something is readable.  The parent
+ * writes an fd number down the pipe when that fd should no longer be
+ * watched; end-of-file on the pipe means the parent has gone away, and
+ * the helper exits. */
+static void wake_parent(int pipefd, struct devices *devices)
+{
+	int parent = getppid();
+	nice(19);
+
+	set_fd(pipefd, devices);
+
+	for (;;) {
+		fd_set rfds = devices->infds;
+
+		/* After a failed select() rfds means nothing. */
+		if (select(devices->max_infd+1, &rfds, NULL, NULL, NULL) < 0) {
+			if (errno == EINTR)
+				continue;
+			err(1, "select() in waker");
+		}
+		if (FD_ISSET(pipefd, &rfds)) {
+			int ignorefd;
+			ssize_t n = read(pipefd, &ignorefd, sizeof(ignorefd));
+
+			if (n == 0)
+				exit(0);
+			if (n < 0) {
+				if (errno == EINTR)
+					continue;
+				err(1, "reading from parent");
+			}
+			if (n != (ssize_t)sizeof(ignorefd))
+				errx(1, "short read from parent");
+			/* FD_CLR() on an out-of-range fd is undefined. */
+			if (ignorefd >= 0 && ignorefd < FD_SETSIZE)
+				FD_CLR(ignorefd, &devices->infds);
+		}
+		kill(parent, SIGUSR1);
+	}
+}
+
+/* SIGUSR1 handler that does nothing at all: the signal only has to
+ * interrupt a blocked system call, not terminate the process. */
+static void wakeup(int signo)
+{
+	(void)signo;
+}
+
+/* A packet is waiting on the tun fd: read it into the DMA buffer bound at
+ * the NET_PEERNUM slot of the device page and raise the interrupt.  With
+ * no buffer available the packet is read into discard_iov and dropped;
+ * that is only warned about once the guest has produced output.
+ * Always returns 1; exits if the read fails. */
+static int handle_tun_input(int fd, struct device *dev)
+{
+	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+	bool *seen_output = dev->priv;
+	u32 irq = 0, num;
+	u64 *lenp;
+	int len;
+
+	lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
+			      &irq);
+	if (lenp == NULL) {
+		if (*seen_output)
+			warn("network: no dma buffer!");
+		iov[0] = discard_iov;
+		num = 1;
+	}
+
+	len = readv(dev->fd, iov, num);
+	if (len <= 0)
+		err(1, "reading network");
+
+	if (lenp != NULL) {
+		*lenp = len;
+		trigger_irq(fd, irq);
+	}
+	verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
+		((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
+		lenp ? "sent" : "discarded");
+	return 1;
+}
+
+/* We use fcntl locks to reserve network slots (autocleanup!)
+ * Try to write-lock one byte of the file per slot, lowest first, and
+ * return the index of the first lock obtained.  One page holds
+ * pagesize / sizeof(struct lguest_net) slots.  Exits if all are taken. */
+static unsigned int find_slot(int netfd, const char *filename)
+{
+	const size_t nslots = getpagesize()/sizeof(struct lguest_net);
+	struct flock fl;
+
+	fl.l_type = F_WRLCK;
+	fl.l_whence = SEEK_SET;
+	fl.l_len = 1;
+	fl.l_start = 0;
+
+	while (fl.l_start < nslots) {
+		if (fcntl(netfd, F_SETLK, &fl) == 0)
+			return fl.l_start;
+		fl.l_start++;
+	}
+	errx(1, "No free slots in network file %s", filename);
+}
+
+/* Set up a network device backed by a file shared between guests.
+ *
+ * The file is created, as one page of 0xFF bytes, if it does not exist.
+ * A slot in it is reserved with find_slot() and recorded in the device
+ * features, and the file is then mapped shared over the device's page.
+ * The descriptor is deliberately left open: closing it would drop the
+ * fcntl lock that reserves the slot.  Exits on any failure. */
+static void setup_net_file(const char *filename,
+			   struct lguest_device_desc *descs,
+			   struct devices *devices)
+{
+	int netfd;
+	struct device *dev;
+
+	netfd = open(filename, O_RDWR, 0);
+	if (netfd < 0) {
+		if (errno == ENOENT) {
+			netfd = open(filename, O_RDWR|O_CREAT, 0600);
+			if (netfd >= 0) {
+				char page[getpagesize()];
+				/* 0xFFFF == NO_GUEST */
+				memset(page, 0xFF, sizeof(page));
+				/* A short file would fault once it is
+				 * mapped and touched below. */
+				if (write(netfd, page, sizeof(page))
+				    != (ssize_t)sizeof(page))
+					err(1, "initializing net file '%s'",
+					    filename);
+			}
+		}
+		if (netfd < 0)
+			err(1, "cannot open net file '%s'", filename);
+	}
+
+	dev = new_device(devices, descs, LGUEST_DEVICE_T_NET, 1,
+			 -1, NULL, 0, NULL);
+
+	/* This is the slot for the guest to use. */
+	dev->desc->features = find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM;
+	/* We overwrite the /dev/zero mapping with the actual file. */
+	if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
+		 MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
+		err(1, "could not mmap '%s'", filename);
+	verbose("device %p@%p: shared net %s, peer %i\n", dev->desc,
+		(void *)(dev->desc->pfn * getpagesize()), filename,
+		dev->desc->features & ~LGUEST_NET_F_NOCSUM);
+}
+
+/* Convert a dotted-quad IPv4 address such as "192.168.0.1" into its
+ * 32-bit value in host byte order.  Exits if the string does not start
+ * with four numbers separated by dots, or if any of them exceeds 255. */
+static u64 str2ip(const char *ipaddr)
+{
+	unsigned int byte[4];
+
+	/* Anything short of four conversions leaves part of byte[] unset. */
+	if (sscanf(ipaddr, "%u.%u.%u.%u",
+		   &byte[0], &byte[1], &byte[2], &byte[3]) != 4
+	    || byte[0] > 255 || byte[1] > 255
+	    || byte[2] > 255 || byte[3] > 255)
+		errx(1, "Bad IP address '%s'", ipaddr);
+
+	return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
+}
+
+/* Set up a network device backed by a freshly created tap interface that
+ * is given the address ipaddr.
+ *
+ * The device page is filled with 0xFF and the interface's hardware
+ * address is then stored at its start.  dev->priv points at a bool,
+ * initially false, that handle_tun_output() sets on the guest's first
+ * output.  Exits on any failure. */
+static void setup_tun_net(const char *ipaddr,
+			  struct lguest_device_desc *descs,
+			  struct devices *devices)
+{
+	struct device *dev;
+	struct ifreq ifr;
+	int netfd;
+	bool *seen_output;
+	/* Parse once; HIPQUAD() would otherwise parse it four more times. */
+	u64 ip = str2ip(ipaddr);
+
+	netfd = open("/dev/net/tun", O_RDWR);
+	if (netfd < 0)
+		err(1, "opening /dev/net/tun");
+
+	memset(&ifr, 0, sizeof(ifr));
+	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
+	strcpy(ifr.ifr_name, "tap%d");
+	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
+		err(1, "configuring /dev/net/tun");
+
+	dev = new_device(devices, descs, LGUEST_DEVICE_T_NET, 1,
+			 netfd, handle_tun_input,
+			 peer_offset(0), handle_tun_output);
+	seen_output = malloc(sizeof(*seen_output));
+	if (!seen_output)
+		err(1, "allocating tun device state");
+	*seen_output = false;
+	dev->priv = seen_output;
+
+	/* We are peer 0, rest is all NO_GUEST */
+	memset(dev->mem, 0xFF, getpagesize());
+	configure_device(ifr.ifr_name, ip, dev->mem);
+
+	/* You will be peer 1: we should create enough jitter to randomize */
+	dev->desc->features = NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS;
+	verbose("device %p@%p: tun net %u.%u.%u.%u\n", dev->desc,
+		(void *)(dev->desc->pfn * getpagesize()),
+		HIPQUAD(ip));
+}
+
+/* Set up a block device backed by filename.
+ *
+ * The file is opened read-write with O_LARGEFILE and O_DIRECT.  Its size
+ * in bytes is kept behind dev->priv for handle_block_output(), and the
+ * number of 512-byte sectors is published in the device page.
+ * Exits on any failure. */
+static void setup_block_file(const char *filename,
+			     struct lguest_device_desc *descs,
+			     struct devices *devices)
+{
+	int fd;
+	struct device *dev;
+	off64_t *blocksize;
+	struct lguest_block_page *p;
+
+	fd = open(filename, O_RDWR|O_LARGEFILE|O_DIRECT, 0);
+	if (fd < 0)
+		err(1, "Opening %s", filename);
+
+	dev = new_device(devices, descs, LGUEST_DEVICE_T_BLOCK, 1,
+			 fd, NULL, 0, handle_block_output);
+	dev->desc->features = LGUEST_DEVICE_F_RANDOMNESS;
+
+	blocksize = malloc(sizeof(*blocksize));
+	if (!blocksize)
+		err(1, "allocating block device state");
+	dev->priv = blocksize;
+
+	/* A failure here would otherwise be recorded as a size of -1. */
+	*blocksize = lseek64(fd, 0, SEEK_END);
+	if (*blocksize < 0)
+		err(1, "Finding size of %s", filename);
+	p = dev->mem;
+
+	p->num_sectors = *blocksize/512;
+	verbose("device %p@%p: block %i sectors\n", dev->desc,
+		(void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
+}
+
+/* Guest console output: write the buffers to stdout and return what
+ * writev() returned.  fd and dev are not used. */
+static u64 handle_console_output(int fd, const struct iovec *iov,
+				 unsigned num, struct device*dev)
+{
+	ssize_t written = writev(STDOUT_FILENO, iov, num);
+
+	return written;
+}
+
+/* Set up the console device on stdin/stdout.
+ *
+ * If stdin is a terminal, signal generation, canonical mode and echo are
+ * switched off, and the original settings are restored at exit.
+ * dev->priv receives a struct console_abort with its count zeroed.
+ * Exits if memory cannot be allocated. */
+static void setup_console(struct lguest_device_desc *descs,
+			  struct devices *devices)
+{
+	struct device *dev;
+	struct console_abort *ctrl_c;
+
+	if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
+		struct termios term = orig_term;
+		term.c_lflag &= ~(ISIG|ICANON|ECHO);
+		tcsetattr(STDIN_FILENO, TCSANOW, &term);
+		atexit(restore_term);
+	}
+
+	/* We don't currently require a page for the console. */
+	dev = new_device(devices, descs, LGUEST_DEVICE_T_CONSOLE, 0,
+			 STDIN_FILENO, handle_console_input,
+			 4, handle_console_output);
+	ctrl_c = malloc(sizeof(*ctrl_c));
+	if (!ctrl_c)
+		err(1, "allocating console state");
+	ctrl_c->count = 0;
+	dev->priv = ctrl_c;
+	verbose("device %p@%p: console\n", dev->desc,
+		(void *)(dev->desc->pfn * getpagesize()));
+}
+
+/* If arg starts with prefix, return the part of arg that follows it;
+ * otherwise return NULL. */
+static const char *get_arg(const char *arg, const char *prefix)
+{
+	size_t plen = strlen(prefix);
+
+	if (strncmp(arg, prefix, plen) != 0)
+		return NULL;
+	return arg + plen;
+}
+
+/* The guest sent DMA descriptor dma to address addr.  Convert it to an
+ * iovec and pass it to the first device that has an output handler and
+ * watches exactly that address; the handler's return value is stored in
+ * the descriptor's used_len.  If no device matches, a warning is printed.
+ * Always returns 0; exits if dma is 0. */
+static u32 handle_device(int fd, unsigned long dma, unsigned long addr,
+			 struct devices *devices)
+{
+	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+	struct device *dev;
+	unsigned num = 0;
+	u64 *lenp = dma2iov(dma, iov, &num);
+
+	if (lenp == NULL)
+		errx(1, "Bad SEND_DMA %li for address %#lx\n", dma, addr);
+
+	for (dev = devices->dev; dev != NULL; dev = dev->next) {
+		if (!dev->handle_output || addr != dev->watch_address)
+			continue;
+
+		*lenp = dev->handle_output(fd, iov, num, dev);
+		return 0;
+	}
+	warnx("Pending dma %p, addr %p", (void *)dma, (void *)addr);
+	return 0;
+}
+
+/* Service every device whose input fd is readable, repeating until a
+ * non-blocking select() reports nothing ready (or fails).
+ *
+ * When a device's handler returns 0 its fd is dropped from the watched
+ * set and the fd number is written to childfd so that the helper process
+ * stops watching it too. */
+static void handle_input(int fd, int childfd, struct devices *devices)
+{
+	for (;;) {
+		struct device *i;
+		fd_set fds = devices->infds;
+		/* Zero timeout means "poll".  Set up afresh each time
+		 * round because select() is allowed to modify it. */
+		struct timeval no_wait = { .tv_sec = 0, .tv_usec = 0 };
+
+		/* 0: nothing ready.  -1: failed, and fds is meaningless. */
+		if (select(devices->max_infd+1, &fds, NULL, NULL, &no_wait) <= 0)
+			break;
+
+		for (i = devices->dev; i; i = i->next) {
+			if (i->handle_input && FD_ISSET(i->fd, &fds)) {
+				if (!i->handle_input(fd, i)) {
+					FD_CLR(i->fd, &devices->infds);
+					/* Tell child to ignore it too... */
+					if (write(childfd, &i->fd, sizeof(i->fd))
+					    != (ssize_t)sizeof(i->fd))
+						warn("telling waker to ignore fd %i",
+						     i->fd);
+				}
+			}
+		}
+	}
+}
+
+/* lguest [--verbose] <mem> vmlinux [device options]... [args...]
+ *
+ * Sets up guest memory and devices, loads the kernel (ELF, or else
+ * treated as a bzImage), forks the helper that signals us when input is
+ * pending, hands the guest to /dev/lguest and then services its DMA
+ * requests and device input forever.  Only leaves through exit paths. */
+int main(int argc, char *argv[])
+{
+	/* pgdir and initrd_size start at 0 in case a loader leaves them
+	 * alone. */
+	unsigned long mem, pgdir = 0, entry, initrd_size = 0;
+	u64 page_offset = 0;
+	int arg, kern_fd, fd, child, pipefd[2];
+	Elf64_Ehdr hdr;
+	struct sigaction act;
+	sigset_t sigset;
+	struct lguest_device_desc *devdescs;
+	struct devices devices;
+	/* Deliberately address 0, which the loader maps as guest memory. */
+	struct lguest_boot_info *boot = (void *)0;
+	const char *initrd_name = NULL;
+	u64 (*load)(int, const Elf64_Ehdr *ehdr, unsigned long,
+		    unsigned long *, const char *, unsigned long *,
+		    u64 *);
+
+	if (argv[1] && strcmp(argv[1], "--verbose") == 0) {
+		verbose = true;
+		argv++;
+		argc--;
+	}
+
+	if (argc < 3)
+		errx(1, "Usage: lguest [--verbose] <mem> vmlinux "
+			"[--sharenet=<filename>|--tunnet=<ipaddr>|--block=<filename>"
+			"|--initrd=<filename>]... [args...]");
+
+	zero_fd = open("/dev/zero", O_RDONLY, 0);
+	if (zero_fd < 0)
+		err(1, "Opening /dev/zero");
+
+	mem = memparse(argv[1]);
+	kern_fd = open(argv[2], O_RDONLY, 0);
+	if (kern_fd < 0)
+		err(1, "Opening %s", argv[2]);
+
+	if (read(kern_fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+		err(1, "Reading %s elf header", argv[2]);
+
+	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
+		load = map_elf;
+	else
+		load = load_bzimage;
+
+	devices.max_infd = -1;
+	devices.dev = NULL;
+	FD_ZERO(&devices.infds);
+
+	/* The device descriptors live in the page right after guest memory. */
+	devdescs = map_pages(mem, 1);
+	arg = 3;
+	while (argv[arg] && argv[arg][0] == '-') {
+		const char *argval;
+
+		if ((argval = get_arg(argv[arg], "--sharenet=")) != NULL)
+			setup_net_file(argval, devdescs, &devices);
+		else if ((argval = get_arg(argv[arg], "--tunnet=")) != NULL)
+			setup_tun_net(argval, devdescs, &devices);
+		else if ((argval = get_arg(argv[arg], "--block=")) != NULL)
+			setup_block_file(argval, devdescs, &devices);
+		else if ((argval = get_arg(argv[arg], "--initrd=")) != NULL)
+			initrd_name = argval;
+		else
+			errx(1, "unknown arg '%s'", argv[arg]);
+		arg++;
+	}
+
+	entry = load(kern_fd, &hdr, mem, &pgdir, initrd_name, &initrd_size,
+		     &page_offset);
+	setup_console(devdescs, &devices);
+
+	/* NOTE(review): concat() does no bounds checking; the capacity of
+	 * boot->cmdline is not visible from here. */
+	concat(boot->cmdline, argv+arg);
+	boot->max_pfn = mem/getpagesize();
+	boot->initrd_size = initrd_size;
+
+	act.sa_handler = wakeup;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = 0;
+	sigaction(SIGUSR1, &act, NULL);
+
+	if (pipe(pipefd) != 0)
+		err(1, "creating pipe");
+	child = fork();
+	if (child == -1)
+		err(1, "forking");
+
+	if (child == 0) {
+		close(pipefd[1]);
+		/* Never returns. */
+		wake_parent(pipefd[0], &devices);
+	}
+	close(pipefd[0]);
+
+	/* SIGUSR1 is only let through while we are blocked in read(). */
+	sigemptyset(&sigset);
+	sigaddset(&sigset, SIGUSR1);
+	sigprocmask(SIG_BLOCK, &sigset, NULL);
+
+	/* LGUEST_GUEST_TOP defined in Makefile, just below us. */
+	fd = tell_kernel(LGUEST_GUEST_TOP/getpagesize(),
+			 pgdir, entry, page_offset);
+
+	for (;;) {
+		unsigned long arr[2];
+		int readval;
+
+		sigprocmask(SIG_UNBLOCK, &sigset, NULL);
+		readval = read(fd, arr, sizeof(arr));
+		sigprocmask(SIG_BLOCK, &sigset, NULL);
+
+		switch (readval) {
+		case sizeof(arr):
+			handle_device(fd, arr[0], arr[1], &devices);
+			break;
+		case -1:
+			if (errno == EINTR)
+				break;
+			/* fall through: any other error is fatal */
+		default:
+			if (errno == ENOENT) {
+				char reason[1024];
+				if (read(fd, reason, sizeof(reason)) > 0)
+					errx(1, "%s", reason);
+			}
+			err(1, "Running guest failed");
+		}
+		handle_input(fd, pipefd[1], &devices);
+	}
+}
Index: work-pv/Documentation/lguest/Makefile
===================================================================
--- work-pv.orig/Documentation/lguest/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-# This creates the demonstration utility "lguest" which runs a Linux guest.
-
-# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
-# Some shells (dash - ubunu) can't handle numbers that big so we cheat.
-include ../../.config
-LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
-
-CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \
-	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
-LDLIBS:=-lz
-
-all: lguest.lds lguest
-
-# The linker script on x86 is so complex the only way of creating one
-# which will link our binary in the right place is to mangle the
-# default one.
-lguest.lds:
-	$(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
-
-clean:
-	rm -f lguest.lds lguest
Index: work-pv/Documentation/lguest/lguest.c
===================================================================
--- work-pv.orig/Documentation/lguest/lguest.c
+++ /dev/null
@@ -1,1039 +0,0 @@
-/* Simple program to layout "physical" memory for new lguest guest.
- * Linked high to avoid likely physical memory.  */
-#define _LARGEFILE64_SOURCE
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <err.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <elf.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/wait.h>
-#include <fcntl.h>
-#include <assert.h>
-#include <stdbool.h>
-#include <errno.h>
-#include <signal.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <time.h>
-#include <netinet/in.h>
-#include <net/if.h>
-#include <linux/sockios.h>
-#include <linux/if_tun.h>
-#include <sys/uio.h>
-#include <termios.h>
-#include <zlib.h>
-typedef uint32_t u32;
-typedef uint16_t u16;
-typedef uint8_t u8;
-
-#include "../../include/asm/lguest_user.h"
-
-#define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
-#define NET_PEERNUM 1
-#define BRIDGE_PFX "bridge:"
-
-static bool verbose;
-#define verbose(args...) \
-	do { if (verbose) printf(args); fflush(stdout); } while(0)
-
-struct devices
-{
-	fd_set infds;
-	int max_infd;
-
-	struct device *dev;
-};
-
-struct device
-{
-	struct device *next;
-	struct lguest_device_desc *desc;
-	void *mem;
-
-	/* Watch this fd if handle_input non-NULL. */
-	int fd;
-	int (*handle_input)(int fd, struct device *me);
-
-	/* Watch DMA to this address if handle_input non-NULL. */
-	unsigned long watch_address;
-	u32 (*handle_output)(int fd, const struct iovec *iov,
-			     unsigned int num, struct device *me);
-
-	/* Device-specific data. */
-	void *priv;
-};
-
-static char buf[1024];
-static struct iovec discard_iov = { .iov_base=buf, .iov_len=sizeof(buf) };
-static int zero_fd;
-
-/* LGUEST_GUEST_TOP defined in Makefile, just below us.
-   FIXME: vdso gets mapped just under it, and we need to protect that. */
-#define RESERVE_TOP LGUEST_GUEST_TOP - 1024*1024
-
-static u32 memparse(const char *ptr)
-{
-	char *end;
-	unsigned long ret = strtoul(ptr, &end, 0);
-
-	switch (*end) {
-	case 'G':
-	case 'g':
-		ret <<= 10;
-	case 'M':
-	case 'm':
-		ret <<= 10;
-	case 'K':
-	case 'k':
-		ret <<= 10;
-		end++;
-	default:
-		break;
-	}
-	return ret;
-}
-
-static inline unsigned long page_align(unsigned long addr)
-{
-	return ((addr + getpagesize()-1) & ~(getpagesize()-1));
-}
-
-/* initrd gets loaded at top of memory: return length. */
-static unsigned long load_initrd(const char *name, unsigned long end)
-{
-	int ifd;
-	struct stat st;
-	void *iaddr;
-
-	if (!name)
-		return 0;
-
-	ifd = open(name, O_RDONLY, 0);
-	if (ifd < 0)
-		err(1, "Opening initrd '%s'", name);
-		
-	if (fstat(ifd, &st) < 0)
-		err(1, "fstat() on initrd '%s'", name);
-
-	iaddr = mmap((void *)end - st.st_size, st.st_size,
-		     PROT_READ|PROT_EXEC|PROT_WRITE,
-		     MAP_FIXED|MAP_PRIVATE, ifd, 0);
-	if (iaddr != (void *)end - st.st_size)
-		err(1, "Mmaping initrd '%s' returned %p not %p",
-		    name, iaddr, (void *)end - st.st_size);
-	close(ifd);
-	verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
-	return st.st_size;
-}
-
-/* First map /dev/zero over entire memory, then insert kernel. */
-static void map_memory(unsigned long mem)
-{
-	if (mmap(0, mem,
-		 PROT_READ|PROT_WRITE|PROT_EXEC,
-		 MAP_FIXED|MAP_PRIVATE, zero_fd, 0) != (void *)0)
-		err(1, "Mmaping /dev/zero for %li bytes", mem);
-}
-
-static u32 finish(unsigned long mem, unsigned long *page_offset,
-		  const char *initrd, unsigned long *ird_size)
-{
-	u32 *pgdir = NULL, *linear = NULL;
-	int i, pte_pages;
-
-	/* This is a top of mem. */
-	*ird_size = load_initrd(initrd, mem);
-
-	/* Below initrd is used as top level of pagetable. */
-	pte_pages = 1 + (mem/getpagesize() + 1023)/1024;
-
-	pgdir = (u32 *)page_align(mem - *ird_size - pte_pages*getpagesize());
-	linear = (void *)pgdir + getpagesize();
-
-	/* Linear map all of memory at page_offset (to top of mem). */
-	if (mem > -*page_offset)
-		mem = -*page_offset;
-
-	for (i = 0; i < mem / getpagesize(); i++)
-		linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
-	verbose("Linear %p-%p (%i-%i) = %#08x-%#08x\n",
-		linear, linear+i-1, 0, i-1, linear[0], linear[i-1]);
-
-	/* Now set up pgd so that this memory is at page_offset */
-	for (i = 0; i < mem / getpagesize(); i += getpagesize()/sizeof(u32)) {
-		pgdir[(i + *page_offset/getpagesize())/1024] 
-			= (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
-		verbose("Top level %lu = %#08x\n",
-			(i + *page_offset/getpagesize())/1024,
-			pgdir[(i + *page_offset/getpagesize())/1024]);
-	}
-
-	return (unsigned long)pgdir;
-}
-
-/* Returns the entry point */
-static u32 map_elf(int elf_fd, const Elf32_Ehdr *ehdr, unsigned long mem,
-		   unsigned long *pgdir_addr,
-		   const char *initrd, unsigned long *ird_size,
-		   unsigned long *page_offset)
-{
-	void *addr;
-	Elf32_Phdr phdr[ehdr->e_phnum];
-	unsigned int i;
-
-	/* Sanity checks. */
-	if (ehdr->e_type != ET_EXEC
-	    || ehdr->e_machine != EM_386
-	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
-	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
-		errx(1, "Malformed elf header");
-
-	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
-		err(1, "Seeking to program headers");
-	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
-		err(1, "Reading program headers");
-
-	map_memory(mem);
-
-	*page_offset = 0;
-	/* We map the loadable segments at virtual addresses corresponding
-	 * to their physical addresses (our virtual == guest physical). */
-	for (i = 0; i < ehdr->e_phnum; i++) {
-		if (phdr[i].p_type != PT_LOAD)
-			continue;
-
-		verbose("Section %i: size %i addr %p\n",
-			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
-		/* We map everything private, writable. */
-		if (phdr[i].p_paddr + phdr[i].p_memsz > mem)
-			errx(1, "Segment %i overlaps end of memory", i);
-
-		/* We expect linear address space. */
-		if (!*page_offset)
-			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
-		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
-			errx(1, "Page offset of section %i different", i);
-
-		/* Recent ld versions don't page align any more. */
-		if (phdr[i].p_paddr % getpagesize()) {
-			phdr[i].p_filesz += (phdr[i].p_paddr % getpagesize());
-			phdr[i].p_offset -= (phdr[i].p_paddr % getpagesize());
-			phdr[i].p_paddr -= (phdr[i].p_paddr % getpagesize());
-		}
-		addr = mmap((void *)phdr[i].p_paddr,
-			    phdr[i].p_filesz,
-			    PROT_READ|PROT_WRITE|PROT_EXEC,
-			    MAP_FIXED|MAP_PRIVATE,
-			    elf_fd, phdr[i].p_offset);
-		if (addr != (void *)phdr[i].p_paddr)
-			err(1, "Mmaping vmlinux segment %i returned %p not %p (%p)",
-			    i, addr, (void *)phdr[i].p_paddr, &phdr[i].p_paddr);
-	}
-
-	*pgdir_addr = finish(mem, page_offset, initrd, ird_size);
-	/* Entry is physical address: convert to virtual */
-	return ehdr->e_entry + *page_offset;
-}
-
-static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
-{
-	unsigned int i, possibilities[256];
-
-	for (i = 0; i + 4 < len; i++) {
-		/* mov 0xXXXXXXXX,%eax */
-		if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
-			return (unsigned long)img[i+4] << 24;
-	}
-	errx(1, "could not determine page offset");
-}
-
-static u32 bzimage(int fd, unsigned long mem, unsigned long *pgdir_addr,
-		   const char *initrd, unsigned long *ird_size,
-		   unsigned long *page_offset)
-{
-	gzFile f;
-	int ret, len = 0;
-	void *img = (void *)0x100000;
-
-	map_memory(mem);
-
-	f = gzdopen(fd, "rb");
-	if (gzdirect(f))
-		errx(1, "did not find correct gzip header");
-	while ((ret = gzread(f, img + len, 65536)) > 0)
-		len += ret;
-	if (ret < 0)
-		err(1, "reading image from bzImage");
-
-	verbose("Unpacked size %i addr %p\n", len, img);
-	*page_offset = intuit_page_offset(img, len);
-	*pgdir_addr = finish(mem, page_offset, initrd, ird_size);
-
-	/* Entry is physical address: convert to virtual */
-	return (u32)img + *page_offset;
-}
-
-static u32 load_bzimage(int bzimage_fd, const Elf32_Ehdr *ehdr, 
-			unsigned long mem, unsigned long *pgdir_addr,
-			const char *initrd, unsigned long *ird_size,
-			unsigned long *page_offset)
-{
-	unsigned char c;
-	int state = 0;
-
-	/* Just brute force it. */
-	while (read(bzimage_fd, &c, 1) == 1) {
-		switch (state) {
-		case 0:
-			if (c == 0x1F)
-				state++;
-			break;
-		case 1:
-			if (c == 0x8B)
-				state++;
-			else
-				state = 0;
-			break;
-		case 2 ... 8:
-			state++;
-			break;
-		case 9:
-			lseek(bzimage_fd, -10, SEEK_CUR);
-			if (c != 0x03) /* Compressed under UNIX. */
-				state = -1;
-			else
-				return bzimage(bzimage_fd, mem, pgdir_addr,
-					       initrd, ird_size, page_offset);
-		}
-	}
-	errx(1, "Could not find kernel in bzImage");
-}
-
-static void *map_pages(unsigned long addr, unsigned int num)
-{
-	if (mmap((void *)addr, getpagesize() * num,
-		 PROT_READ|PROT_WRITE|PROT_EXEC,
-		 MAP_FIXED|MAP_PRIVATE, zero_fd, 0) != (void *)addr)
-		err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr);
-	return (void *)addr;
-}
-
-static struct lguest_device_desc *
-get_dev_entry(struct lguest_device_desc *descs, u16 type, u16 num_pages)
-{
-	static unsigned long top = RESERVE_TOP;
-	int i;
-	unsigned long pfn = 0;
-
-	if (num_pages) {
-		top -= num_pages*getpagesize();
-		map_pages(top, num_pages);
-		pfn = top / getpagesize();
-	}
-
-	for (i = 0; i < LGUEST_MAX_DEVICES; i++) {
-		if (!descs[i].type) {
-			descs[i].features = descs[i].status = 0;
-			descs[i].type = type;
-			descs[i].num_pages = num_pages;
-			descs[i].pfn = pfn;
-			return &descs[i];
-		}
-	}
-	errx(1, "too many devices");
-}
-
-static void set_fd(int fd, struct devices *devices)
-{
-	FD_SET(fd, &devices->infds);
-	if (fd > devices->max_infd)
-		devices->max_infd = fd;
-}
-
-static struct device *new_device(struct devices *devices,
-				 struct lguest_device_desc *descs,
-				 u16 type, u16 num_pages,
-				 int fd,
-				 int (*handle_input)(int, struct device *),
-				 unsigned long watch_off,
-				 u32 (*handle_output)(int,
-						      const struct iovec *,
-						      unsigned,
-						      struct device *))
-{
-	struct device *dev = malloc(sizeof(*dev));
-
-	dev->next = devices->dev;
-	devices->dev = dev;
-
-	dev->fd = fd;
-	if (handle_input)
-		set_fd(dev->fd, devices);
-	dev->desc = get_dev_entry(descs, type, num_pages);
-	dev->mem = (void *)(dev->desc->pfn * getpagesize());
-	dev->handle_input = handle_input;
-	dev->watch_address = (unsigned long)dev->mem + watch_off;
-	dev->handle_output = handle_output;
-	return dev;
-}
-
-static int tell_kernel(u32 pagelimit, u32 pgdir, u32 start, u32 page_offset)
-{
-	u32 args[] = { LHREQ_INITIALIZE,
-		       pagelimit, pgdir, start, page_offset };
-	int fd = open("/dev/lguest", O_RDWR);
-
-	if (fd < 0)
-		err(1, "Opening /dev/lguest");
-
-	verbose("Telling kernel limit %u, pgdir %i, e=%#08x page_off=0x%08x\n",
-		pagelimit, pgdir, start, page_offset);
-	if (write(fd, args, sizeof(args)) < 0)
-		err(1, "Writing to /dev/lguest");
-	return fd;
-}
-
-static void concat(char *dst, char *args[])
-{
-	unsigned int i, len = 0;
-
-	for (i = 0; args[i]; i++) {
-		strcpy(dst+len, args[i]);
-		strcat(dst+len, " ");
-		len += strlen(args[i]) + 1;
-	}
-	/* In case it's empty. */
-	dst[len] = '\0';
-}
-
-static void *_check_pointer(unsigned long addr, unsigned int size,
-			    unsigned int line)
-{
-	if (addr >= RESERVE_TOP || addr + size >= RESERVE_TOP)
-		errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
-	return (void *)addr;
-}
-#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
-
-/* Returns pointer to dma->used_len */
-static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
-{
-	unsigned int i;
-	struct lguest_dma *udma;
-
-	/* No buffers? */
-	if (dma == 0) {
-		printf("no buffers\n");
-		return NULL;
-	}
-
-	udma = check_pointer(dma, sizeof(*udma));
-	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
-		if (!udma->len[i])
-			break;
-
-		iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]);
-		iov[i].iov_len = udma->len[i];
-	}
-	*num = i;
-	return &udma->used_len;
-}
-
-static u32 *get_dma_buffer(int fd, void *addr,
-			   struct iovec iov[], unsigned *num, u32 *irq)
-{
-	u32 buf[] = { LHREQ_GETDMA, (u32)addr };
-	unsigned long udma;
-	u32 *res;
-
-	udma = write(fd, buf, sizeof(buf));
-	if (udma == (unsigned long)-1)
-		return NULL;
-
-	/* Kernel stashes irq in ->used_len. */
-	res = dma2iov(udma, iov, num);
-	if (res)
-		*irq = *res;
-	return res;
-}
-
-static void trigger_irq(int fd, u32 irq)
-{
-	u32 buf[] = { LHREQ_IRQ, irq };
-	if (write(fd, buf, sizeof(buf)) != 0)
-		err(1, "Triggering irq %i", irq);
-}
-
-static struct termios orig_term;
-static void restore_term(void)
-{
-	tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
-}
-
-struct console_abort
-{
-	int count;
-	struct timeval start;
-};
-
-/* We DMA input to buffer bound at start of console page. */
-static int handle_console_input(int fd, struct device *dev)
-{
-	u32 num, irq = 0, *lenp;
-	int len;
-	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
-	struct console_abort *abort = dev->priv;
-
-	lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
-	if (!lenp) {
-		warn("console: no dma buffer!");
-		iov[0] = discard_iov;
-		num = 1;
-	}
-
-	len = readv(dev->fd, iov, num);
-	if (len <= 0) {
-		warnx("Failed to get console input, ignoring console.");
-		len = 0;
-	}
-
-	if (lenp) {
-		*lenp = len;
-		trigger_irq(fd, irq);
-	}
-
-	/* Three ^C within one second?  Exit. */
-	if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) {
-		if (!abort->count++)
-			gettimeofday(&abort->start, NULL);
-		else if (abort->count == 3) {
-			struct timeval now;
-			gettimeofday(&now, NULL);
-			if (now.tv_sec <= abort->start.tv_sec+1)
-				exit(2);
-			abort->count = 0;
-		}
-	} else
-		abort->count = 0;
-
-	if (!len) {
-		restore_term();
-		return 0;
-	}
-	return 1;
-}
-
-static unsigned long peer_offset(unsigned int peernum)
-{
-	return 4 * peernum;
-}
-
-static u32 handle_tun_output(int fd, const struct iovec *iov,
-			     unsigned num, struct device *dev)
-{
-	/* Now we've seen output, we should warn if we can't get buffers. */
-	*(bool *)dev->priv = true;
-	return writev(dev->fd, iov, num);
-}
-
-static u32 handle_block_output(int fd, const struct iovec *iov,
-			       unsigned num, struct device *dev)
-{
-	struct lguest_block_page *p = dev->mem;
-	u32 irq, reply_num, *lenp;
-	int len;
-	struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
-	off64_t device_len, off = (off64_t)p->sector * 512;
-
-	device_len = *(off64_t *)dev->priv;
-
-	if (off >= device_len)
-		err(1, "Bad offset %llu vs %llu", off, device_len);
-	if (lseek64(dev->fd, off, SEEK_SET) != off)
-		err(1, "Bad seek to sector %i", p->sector);
-
-	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
-
-	lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
-	if (!lenp)
-		err(1, "Block request didn't give us a dma buffer");
-
-	if (p->type) {
-		len = writev(dev->fd, iov, num);
-		if (off + len > device_len) {
-			ftruncate(dev->fd, device_len);
-			errx(1, "Write past end %llu+%u", off, len);
-		}
-		*lenp = 0;
-	} else {
-		len = readv(dev->fd, reply, reply_num);
-		*lenp = len;
-	}
-
-	p->result = 1 + (p->bytes != len);
-	trigger_irq(fd, irq);
-	return 0;
-}
-
-#define HIPQUAD(ip)				\
-	((u8)(ip >> 24)),			\
-	((u8)(ip >> 16)),			\
-	((u8)(ip >> 8)),			\
-	((u8)(ip))
-
-static void configure_device(int fd, const char *devname, u32 ipaddr,
-			     unsigned char hwaddr[6])
-{
-	struct ifreq ifr;
-	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
-
-	memset(&ifr, 0, sizeof(ifr));
-	strcpy(ifr.ifr_name, devname);
-	sin->sin_family = AF_INET;
-	sin->sin_addr.s_addr = htonl(ipaddr);
-	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
-		err(1, "Setting %s interface address", devname);
-	ifr.ifr_flags = IFF_UP;
-	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
-		err(1, "Bringing interface %s up", devname);
-
-	if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
-		err(1, "getting hw address for %s", devname);
-
-	memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
-}
-
-/* We send lguest_add signals while input is pending: avoids races. */
-static void wake_parent(int pipefd, struct devices *devices)
-{
-	int parent = getppid();
-	nice(19);
-
-	set_fd(pipefd, devices);
-
-	for (;;) {
-		fd_set rfds = devices->infds;
-
-		select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
-		if (FD_ISSET(pipefd, &rfds)) {
-			int ignorefd;
-			if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0)
-				exit(0);
-			FD_CLR(ignorefd, &devices->infds);
-		}
-		kill(parent, SIGUSR1);
-	}
-}
-
-/* We don't want signal to kill us, just jerk us out of kernel. */
-static void wakeup(int signo)
-{
-}
-
-static int handle_tun_input(int fd, struct device *dev)
-{
-	u32 irq = 0, num, *lenp;
-	int len;
-	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
-
-	lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
-			      &irq);
-	if (!lenp) {
-		if (*(bool *)dev->priv)
-			warn("network: no dma buffer!");
-		iov[0] = discard_iov;
-		num = 1;
-	}
-
-	len = readv(dev->fd, iov, num);
-	if (len <= 0)
-		err(1, "reading network");
-	if (lenp) {
-		*lenp = len;
-		trigger_irq(fd, irq);
-	}
-	verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
-		((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
-		lenp ? "sent" : "discarded");
-	return 1;
-}
-
-/* We use fnctl locks to reserve network slots (autocleanup!) */
-static unsigned int find_slot(int netfd, const char *filename)
-{
-	struct flock fl;
-
-	fl.l_type = F_WRLCK;
-	fl.l_whence = SEEK_SET;
-	fl.l_len = 1;
-	for (fl.l_start = 0;
-	     fl.l_start < getpagesize()/sizeof(struct lguest_net);
-	     fl.l_start++) {
-		if (fcntl(netfd, F_SETLK, &fl) == 0)
-			return fl.l_start;
-	}
-	errx(1, "No free slots in network file %s", filename);
-}
-
-static void setup_net_file(const char *filename,
-			   struct lguest_device_desc *descs,
-			   struct devices *devices)
-{
-	int netfd;
-	struct device *dev;
-
-	netfd = open(filename, O_RDWR, 0);
-	if (netfd < 0) {
-		if (errno == ENOENT) {
-			netfd = open(filename, O_RDWR|O_CREAT, 0600);
-			if (netfd >= 0) {
-				char page[getpagesize()];
-				/* 0xFFFF == NO_GUEST */
-				memset(page, 0xFF, sizeof(page));
-				write(netfd, page, sizeof(page));
-			}
-		}
-		if (netfd < 0)
-			err(1, "cannot open net file '%s'", filename);
-	}
-
-	dev = new_device(devices, descs, LGUEST_DEVICE_T_NET, 1,
-			 -1, NULL, 0, NULL);
-
-	/* This is the slot for the guest to use. */
-	dev->desc->features = find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM;
-	/* We overwrite the /dev/zero mapping with the actual file. */
-	if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
-			 MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
-			err(1, "could not mmap '%s'", filename);
-	verbose("device %p@%p: shared net %s, peer %i\n", dev->desc, 
-		(void *)(dev->desc->pfn * getpagesize()), filename, 
-		dev->desc->features & ~LGUEST_NET_F_NOCSUM);
-}
-
-static u32 str2ip(const char *ipaddr)
-{
-	unsigned int byte[4];
-
-	sscanf(ipaddr, "%u.%u.%u.%u", &byte[0], &byte[1], &byte[2], &byte[3]);
-	return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
-}
-
-/* adapted from libbridge */
-static void add_to_bridge(int fd, const char *if_name, const char *br_name)
-{
-	int r, ifidx;
-	struct ifreq ifr;
-
-	if (!*br_name)
-		errx(1, "must specify bridge name");
-
-	ifidx = if_nametoindex(if_name);
-	if (!ifidx)
-		errx(1, "interface %s does not exist!\n", if_name);
-
-	strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
-	ifr.ifr_ifindex = ifidx;
-	r = ioctl(fd, SIOCBRADDIF, &ifr);
-	if (r != -1)
-		return;
-
-	switch (errno) {
-	case ENODEV:
-		errx(1, "bridge %s does not exist!\n", br_name);
-	case EBUSY:
-		errx(1, "device %s is already a member of a bridge; "
-			"can't enslave it to bridge %s.\n", if_name, br_name);
-	case ELOOP:
-		errx(1, "device %s is a bridge device itself; "
-			"can't enslave a bridge device to a bridge device.\n",
-			if_name);
-	default:
-		err(1, "can't add %s to bridge %s\n", if_name, br_name);
-	}
-}
-
-
-static void setup_tun_net(const char *arg,
-			  struct lguest_device_desc *descs,
-			  struct devices *devices)
-{
-	struct device *dev;
-	struct ifreq ifr;
-	int netfd, ipfd;
-	u32 ipaddr;
-	const char *br_name = NULL;
-
-	netfd = open("/dev/net/tun", O_RDWR);
-	if (netfd < 0)
-		err(1, "opening /dev/net/tun");
-
-	memset(&ifr, 0, sizeof(ifr));
-	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
-	strcpy(ifr.ifr_name, "tap%d");
-	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
-		err(1, "configuring /dev/net/tun");
-
-	dev = new_device(devices, descs, LGUEST_DEVICE_T_NET, 1,
-			 netfd, handle_tun_input,
-			 peer_offset(0), handle_tun_output);
-	dev->priv = malloc(sizeof(bool));
-	*(bool *)dev->priv = false;
-
-	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
-	if (ipfd < 0)
-		err(1, "opening IP socket");
-
-	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
-		ipaddr = INADDR_ANY;
-		br_name = arg + strlen(BRIDGE_PFX);
-		add_to_bridge(ipfd, ifr.ifr_name, br_name);
-	} else
-		ipaddr = str2ip(arg);
-
-	/* We are peer 0, rest is all NO_GUEST */
-	configure_device(ipfd, ifr.ifr_name, ipaddr, dev->mem);
-	close (ipfd);
-
-	/* You will be peer 1: we should create enough jitter to randomize */
-	dev->desc->features = NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS;
-	verbose("device %p@%p: tun net %u.%u.%u.%u\n", dev->desc,
-		(void *)(dev->desc->pfn * getpagesize()),
-		HIPQUAD(ipaddr));
-	if (br_name)
-		verbose("attched to bridge: %s\n", br_name);
-}
-
-static void setup_block_file(const char *filename,
-			     struct lguest_device_desc *descs,
-			     struct devices *devices)
-{
-	int fd;
-	struct device *dev;
-	off64_t *blocksize;
-	struct lguest_block_page *p;
-
-	fd = open(filename, O_RDWR|O_LARGEFILE|O_DIRECT, 0);
-	if (fd < 0)
-		err(1, "Opening %s", filename);
-
-	dev = new_device(devices, descs, LGUEST_DEVICE_T_BLOCK, 1,
-			 fd, NULL, 0, handle_block_output);
-	dev->desc->features = LGUEST_DEVICE_F_RANDOMNESS;
-	blocksize = dev->priv = malloc(sizeof(*blocksize));
-	*blocksize = lseek64(fd, 0, SEEK_END);
-	p = dev->mem;
-
-	p->num_sectors = *blocksize/512;
-	verbose("device %p@%p: block %i sectors\n", dev->desc, 
-		(void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
-}
-
-static u32 handle_console_output(int fd, const struct iovec *iov,
-				 unsigned num, struct device*dev)
-{
-	return writev(STDOUT_FILENO, iov, num);
-}
-
-static void setup_console(struct lguest_device_desc *descs,
-			  struct devices *devices)
-{
-	struct device *dev;
-
-	if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
-		struct termios term = orig_term;
-		term.c_lflag &= ~(ISIG|ICANON|ECHO);
-		tcsetattr(STDIN_FILENO, TCSANOW, &term);
-		atexit(restore_term);
-	}
-
-	/* We don't currently require a page for the console. */
-	dev = new_device(devices, descs, LGUEST_DEVICE_T_CONSOLE, 0,
-			 STDIN_FILENO, handle_console_input,
-			 4, handle_console_output);
-	dev->priv = malloc(sizeof(struct console_abort));
-	((struct console_abort *)dev->priv)->count = 0;
-	verbose("device %p@%p: console\n", dev->desc, 
-		(void *)(dev->desc->pfn * getpagesize()));
-}
-
-static const char *get_arg(const char *arg, const char *prefix)
-{
-	if (strncmp(arg, prefix, strlen(prefix)) == 0)
-		return arg + strlen(prefix);
-	return NULL;
-}
-
-static u32 handle_device(int fd, unsigned long dma, unsigned long addr,
-			 struct devices *devices)
-{
-	struct device *i;
-	u32 *lenp;
-	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
-	unsigned num = 0;
-
-	lenp = dma2iov(dma, iov, &num);
-	if (!lenp)
-		errx(1, "Bad SEND_DMA %li for address %#lx\n", dma, addr);
-
-	for (i = devices->dev; i; i = i->next) {
-		if (i->handle_output && addr == i->watch_address) {
-			*lenp = i->handle_output(fd, iov, num, i);
-			return 0;
-		}
-	}
-	warnx("Pending dma %p, addr %p", (void *)dma, (void *)addr);
-	return 0;
-}
-
-static void handle_input(int fd, int childfd, struct devices *devices)
-{
-	struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
-
-	for (;;) {
-		struct device *i;
-		fd_set fds = devices->infds;
-
-		if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0)
-			break;
-
-		for (i = devices->dev; i; i = i->next) {
-			if (i->handle_input && FD_ISSET(i->fd, &fds)) {
-				if (!i->handle_input(fd, i)) {
-					FD_CLR(i->fd, &devices->infds);
-					/* Tell child to ignore it too... */
-					write(childfd, &i->fd, sizeof(i->fd));
-				}
-			}
-		}
-	}
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned long mem, pgdir, entry, initrd_size, page_offset;
-	int arg, kern_fd, fd, child, pipefd[2];
-	Elf32_Ehdr hdr;
-	struct sigaction act;
-	sigset_t sigset;
-	struct lguest_device_desc *devdescs;
-	struct devices devices;
-	struct lguest_boot_info *boot = (void *)0;
-	const char *initrd_name = NULL;
-	u32 (*load)(int, const Elf32_Ehdr *ehdr, unsigned long,
-		    unsigned long *, const char *, unsigned long *,
-		    unsigned long *);
-
-	if (argv[1] && strcmp(argv[1], "--verbose") == 0) {
-		verbose = true;
-		argv++;
-		argc--;
-	}
-
-	if (argc < 4)
-		errx(1, "Usage: lguest [--verbose] <mem> vmlinux "
-			"[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)"
-			"|--block=<filename>|--initrd=<filename>]... [args...]");
-
-	zero_fd = open("/dev/zero", O_RDONLY, 0);
-	if (zero_fd < 0)
-		err(1, "Opening /dev/zero");
-
-	mem = memparse(argv[1]);
-	kern_fd = open(argv[2], O_RDONLY, 0);
-	if (kern_fd < 0)
-		err(1, "Opening %s", argv[2]);
-
-	if (read(kern_fd, &hdr, sizeof(hdr)) != sizeof(hdr))
-		err(1, "Reading %s elf header", argv[2]);
-
-	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
-		load = map_elf;
-	else
-		load = load_bzimage;
-
-	devices.max_infd = -1;
-	devices.dev = NULL;
-	FD_ZERO(&devices.infds);
-
-	devdescs = map_pages(mem, 1);
-	arg = 3;
-	while (argv[arg] && argv[arg][0] == '-') {
-		const char *argval;
-
-		if ((argval = get_arg(argv[arg], "--sharenet=")) != NULL)
-			setup_net_file(argval, devdescs, &devices);
-		else if ((argval = get_arg(argv[arg], "--tunnet=")) != NULL)
-			setup_tun_net(argval, devdescs, &devices);
-		else if ((argval = get_arg(argv[arg], "--block=")) != NULL)
-			setup_block_file(argval, devdescs, &devices);
-		else if ((argval = get_arg(argv[arg], "--initrd=")) != NULL)
-			initrd_name = argval;
-		else
-			errx(1, "unknown arg '%s'", argv[arg]);
-		arg++;
-	}
-
-	entry = load(kern_fd, &hdr, mem, &pgdir, initrd_name, &initrd_size,
-		     &page_offset);
-	setup_console(devdescs, &devices);
-
-	concat(boot->cmdline, argv+arg);
-	boot->max_pfn = mem/getpagesize();
-	boot->initrd_size = initrd_size;
-
-	act.sa_handler = wakeup;
-	sigemptyset(&act.sa_mask);
-	act.sa_flags = 0;
-	sigaction(SIGUSR1, &act, NULL);
-
-	pipe(pipefd);
-	child = fork();
-	if (child == -1)
-		err(1, "forking");
-
-	if (child == 0) {
-		close(pipefd[1]);
-		wake_parent(pipefd[0], &devices);
-	}
-	close(pipefd[0]);
-
-	sigemptyset(&sigset);
-	sigaddset(&sigset, SIGUSR1);
-	sigprocmask(SIG_BLOCK, &sigset, NULL);
-
-	fd = tell_kernel(RESERVE_TOP/getpagesize(), pgdir, entry, page_offset);
-
-	for (;;) {
-		unsigned long arr[2];
-		int readval;
-
-		sigprocmask(SIG_UNBLOCK, &sigset, NULL);
-		readval = read(fd, arr, sizeof(arr));
-		sigprocmask(SIG_BLOCK, &sigset, NULL);
-
-		switch (readval) {
-		case sizeof(arr):
-			handle_device(fd, arr[0], arr[1], &devices);
-			break;
-		case -1:
-			if (errno == EINTR)
-				break;
-		default:
-			if (errno == ENOENT) {
-				char reason[1024];
-				if (read(fd, reason, sizeof(reason)) > 0)
-					errx(1, "%s", reason);
-			}
-			err(1, "Running guest failed");
-		}
-		handle_input(fd, pipefd[1], &devices);
-	}
-}

--

_______________________________________________
Virtualization mailing list
Virtualization@xxxxxxxxxxxxxx
https://lists.osdl.org/mailman/listinfo/virtualization


[Index of Archives]     [KVM Development]     [Libvirt Development]     [Libvirt Users]     [CentOS Virtualization]     [Netdev]     [Ethernet Bridging]     [Linux Wireless]     [Kernel Newbies]     [Security]     [Linux for Hams]     [Netfilter]     [Bugtraq]     [Yosemite Forum]     [MIPS Linux]     [ARM Linux]     [Linux RAID]     [Linux Admin]     [Samba]

  Powered by Linux