[PATCH] Unified lguest launcher

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is a new version of the unified lguest launcher that applies to
the current tree. Following Rusty's suggestion, I'm worrying less about
being able to load 32-bit kernels on 64-bit machines: changing the
launcher for that case would be the easy part! In the absence of
further objections, I'll commit it.

Signed-off-by: Glauber de Oliveira Costa <gcosta@xxxxxxxxxx>

--
Glauber de Oliveira Costa.
"Free as in Freedom"

"The less confident you are, the more serious you have to act."
Index: linux-2.6.20/Documentation/lguest/Makefile
===================================================================
--- linux-2.6.20.orig/Documentation/lguest/Makefile
+++ linux-2.6.20/Documentation/lguest/Makefile
@@ -1,12 +1,13 @@
 # This creates the demonstration utility "lguest" which runs a Linux guest.
 
-# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
-# Some shells (dash - ubunu) can't handle numbers that big so we cheat.
+# We could use uname -i, but it seems to return "unknown" in a bunch of locations
+ARCH:=$(shell uname -m | sed s/i[3456]86/i386/)
+
 include ../../.config
-LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
+include $(ARCH)/defines
 
 CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \
-	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
+	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds -I$(ARCH)
 LDLIBS:=-lz
 
 all: lguest.lds lguest
Index: linux-2.6.20/Documentation/lguest/i386/defines
===================================================================
--- /dev/null
+++ linux-2.6.20/Documentation/lguest/i386/defines
@@ -0,0 +1,4 @@
+# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
+# Some shells (dash - Ubuntu) can't handle numbers that big so we cheat.
+include ../../.config
+LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
Index: linux-2.6.20/Documentation/lguest/i386/lguest_defs.h
===================================================================
--- /dev/null
+++ linux-2.6.20/Documentation/lguest/i386/lguest_defs.h
@@ -0,0 +1,9 @@
+#ifndef _LGUEST_DEFS_H_
+#define _LGUEST_DEFS_H_
+
+/* LGUEST_GUEST_TOP comes from the Makefile */
+#define RESERVE_TOP_ADDRESS LGUEST_GUEST_TOP - 1024*1024
+
+#include "../../../include/linux/lguest_launcher.h"
+
+#endif
Index: linux-2.6.20/Documentation/lguest/lguest.c
===================================================================
--- linux-2.6.20.orig/Documentation/lguest/lguest.c
+++ linux-2.6.20/Documentation/lguest/lguest.c
@@ -33,7 +33,7 @@
 typedef uint32_t u32;
 typedef uint16_t u16;
 typedef uint8_t u8;
-#include "../../include/linux/lguest_launcher.h"
+#include <lguest_defs.h>
 
 #define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
 #define NET_PEERNUM 1
@@ -64,7 +64,7 @@ struct device
 
 	/* Watch DMA to this key if handle_input non-NULL. */
 	unsigned long watch_key;
-	u32 (*handle_output)(int fd, const struct iovec *iov,
+	unsigned long (*handle_output)(int fd, const struct iovec *iov,
 			     unsigned int num, struct device *me);
 
 	/* Device-specific data. */
@@ -94,20 +94,29 @@ static void *map_zeroed_pages(unsigned l
 }
 
 /* Returns the entry point */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
+
+static unsigned long map_elf(int elf_fd, const void *hdr, 
 			     unsigned long *page_offset)
 {
-	void *addr;
+#ifndef __x86_64__
+	const Elf32_Ehdr *ehdr = hdr;
 	Elf32_Phdr phdr[ehdr->e_phnum];
+#else
+	const Elf64_Ehdr *ehdr = hdr;
+	Elf64_Phdr phdr[ehdr->e_phnum];
+#endif
+	void *addr;
 	unsigned int i;
 
 	/* Sanity checks. */
 	if (ehdr->e_type != ET_EXEC
-	    || ehdr->e_machine != EM_386
-	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
-	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
+	    || ((ehdr->e_machine != EM_386) &&
+		(ehdr->e_machine != EM_X86_64))
+	    || ehdr->e_phentsize != sizeof(phdr[0])
+	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(phdr[0]))
 		errx(1, "Malformed elf header");
 
+
 	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
 		err(1, "Seeking to program headers");
 	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
@@ -120,13 +129,17 @@ static unsigned long map_elf(int elf_fd,
 		if (phdr[i].p_type != PT_LOAD)
 			continue;
 
-		verbose("Section %i: size %i addr %p\n",
-			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+		verbose("Section %i: size %lu addr %p\n",
+		i, (unsigned long)phdr[i].p_memsz, (void *)phdr[i].p_paddr);
 
 		/* We expect linear address space. */
 		if (!*page_offset)
 			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
-		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
+		else if ((*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
+#ifdef __x86_64__
+			 && (phdr[i].p_vaddr != VSYSCALL_START)
+#endif
+			)
 			errx(1, "Page offset of section %i different", i);
 
 		/* We map everything private, writable. */
@@ -210,15 +223,18 @@ static unsigned long load_bzimage(int fd
 	errx(1, "Could not find kernel in bzImage");
 }
 
-static unsigned long load_kernel(int fd, unsigned long *page_offset)
+/* On x86_64 systems the page table hierarchy does not start at a common
+ * location; it is instead indicated by a symbol called boot_level4_pgt. So the
+ * ELF header must outlive this function, so we can look up the pgdir later. */
+static unsigned long load_kernel(int fd, unsigned long *page_offset, void *ehdr)
 {
-	Elf32_Ehdr hdr;
+	Elf64_Ehdr *hdr = ehdr;
 
-	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+	if (read(fd, hdr, sizeof(*hdr)) != sizeof(*hdr))
 		err(1, "Reading kernel");
 
-	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
-		return map_elf(fd, &hdr, page_offset);
+	if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) == 0)
+		return map_elf(fd, ehdr, page_offset);
 
 	return load_bzimage(fd, page_offset);
 }
@@ -250,6 +266,7 @@ static inline unsigned long page_align(u
 	return ((addr + getpagesize()-1) & ~(getpagesize()-1));
 }
 
+#ifndef __x86_64__
 static unsigned long setup_pagetables(unsigned long mem,
 				      unsigned long initrd_size,
 				      unsigned long page_offset)
@@ -285,6 +302,74 @@ static unsigned long setup_pagetables(un
 
 	return (unsigned long)pgdir;
 }
+#else
+static unsigned long find_pgt_symbol(int elf_fd, 
+				Elf64_Ehdr *ehdr, 
+				unsigned long page_offset)
+{
+	unsigned int i;
+	Elf64_Shdr sec[ehdr->e_shnum];
+	Elf64_Sym *syms;
+	char *strtab = NULL;
+	unsigned long pgdir_addr = 0;
+	unsigned long nsyms = 0;
+
+	/* Now process the sections, searching for the boot page tables.
+	 * Start by finding the symtab section. */
+	if (lseek(elf_fd, ehdr->e_shoff, SEEK_SET) < 0)
+		err(1, "Seeking to section headers");
+	if (read(elf_fd, sec, sizeof(sec)) != sizeof(sec))
+		err(1, "Reading section headers");
+
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		if (sec[i].sh_type == SHT_SYMTAB) {
+			int ret = 0;
+			syms = malloc(sec[i].sh_size);
+			if (!syms)
+				err(1,"Not enough memory for symbol table");
+			ret = lseek(elf_fd, sec[i].sh_offset, SEEK_SET);
+			if (ret < 0)
+				err(1, "Seeking to symbol table");
+			ret = read(elf_fd, syms, sec[i].sh_size);
+			if (ret != sec[i].sh_size)
+				err(1, "Reading symbol table");
+			nsyms = sec[i].sh_size / sizeof(Elf64_Sym);
+
+
+			/* symtab links to strtab. We use it to find symbol
+			 * names */
+			strtab = malloc(sec[sec[i].sh_link].sh_size);
+			if (!strtab)
+				err(1,"Not enough memory for string table");
+			ret = lseek(elf_fd, sec[sec[i].sh_link].sh_offset , SEEK_SET);
+			if (ret < 0)
+				err(1, "Seeking to string table");
+			ret = read(elf_fd, strtab, sec[sec[i].sh_link].sh_size);
+			if (ret != sec[sec[i].sh_link].sh_size)
+				err(1, "Reading string table");
+			break;
+		}
+	}
+
+	/* We now have a pointer to the symtab, start searching for the symbol */
+	for (i = 0; i < nsyms; i++) {
+		if ((syms[i].st_shndx == SHN_UNDEF) || !syms[i].st_name)
+			continue;
+		if (!strcmp(BOOT_PGTABLE,
+				(char *)((u64)syms[i].st_name + strtab))) {
+			pgdir_addr = syms[i].st_value - page_offset;
+			break;
+		}
+	}
+
+	if (!pgdir_addr) {
+		errno = ESRCH; 
+		err(1,"Unable to find boot pgdir");
+	}
+
+	return pgdir_addr;
+}
+#endif
 
 static void concat(char *dst, char *args[])
 {
@@ -299,9 +384,10 @@ static void concat(char *dst, char *args
 	dst[len] = '\0';
 }
 
-static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
+static int tell_kernel(unsigned long pgdir, unsigned long start, 
+						unsigned long page_offset)
 {
-	u32 args[] = { LHREQ_INITIALIZE,
+	unsigned long args[] = { LHREQ_INITIALIZE,
 		       LGUEST_GUEST_TOP/getpagesize(), /* Just below us */
 		       pgdir, start, page_offset };
 	int fd;
@@ -379,7 +465,8 @@ static void *_check_pointer(unsigned lon
 #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
 
 /* Returns pointer to dma->used_len */
-static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
+static unsigned long *dma2iov(unsigned long dma, struct iovec iov[], 
+								unsigned *num)
 {
 	unsigned int i;
 	struct lguest_dma *udma;
@@ -396,12 +483,12 @@ static u32 *dma2iov(unsigned long dma, s
 	return &udma->used_len;
 }
 
-static u32 *get_dma_buffer(int fd, void *key,
+static unsigned long *get_dma_buffer(int fd, void *key,
 			   struct iovec iov[], unsigned int *num, u32 *irq)
 {
-	u32 buf[] = { LHREQ_GETDMA, (u32)key };
+	unsigned long buf[] = { LHREQ_GETDMA, (unsigned long)key };
 	unsigned long udma;
-	u32 *res;
+	unsigned long *res;
 
 	udma = write(fd, buf, sizeof(buf));
 	if (udma == (unsigned long)-1)
@@ -415,7 +502,7 @@ static u32 *get_dma_buffer(int fd, void 
 
 static void trigger_irq(int fd, u32 irq)
 {
-	u32 buf[] = { LHREQ_IRQ, irq };
+	unsigned long buf[] = { LHREQ_IRQ, irq };
 	if (write(fd, buf, sizeof(buf)) != 0)
 		err(1, "Triggering irq %i", irq);
 }
@@ -443,7 +530,8 @@ struct console_abort
 /* We DMA input to buffer bound at start of console page. */
 static bool handle_console_input(int fd, struct device *dev)
 {
-	u32 irq = 0, *lenp;
+	u32 irq = 0; 
+	unsigned long *lenp;
 	int len;
 	unsigned int num;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
@@ -487,14 +575,14 @@ static bool handle_console_input(int fd,
 	return true;
 }
 
-static u32 handle_console_output(int fd, const struct iovec *iov,
-				 unsigned num, struct device*dev)
+static unsigned long  handle_console_output(int fd, const struct iovec *iov,
+					 unsigned num, struct device*dev)
 {
 	return writev(STDOUT_FILENO, iov, num);
 }
 
-static u32 handle_tun_output(int fd, const struct iovec *iov,
-			     unsigned num, struct device *dev)
+static unsigned long  handle_tun_output(int fd, const struct iovec *iov,
+					     unsigned num, struct device *dev)
 {
 	/* Now we've seen output, we should warn if we can't get buffers. */
 	*(bool *)dev->priv = true;
@@ -508,7 +596,8 @@ static unsigned long peer_offset(unsigne
 
 static bool handle_tun_input(int fd, struct device *dev)
 {
-	u32 irq = 0, *lenp;
+	u32 irq = 0; 
+	unsigned long *lenp;
 	int len;
 	unsigned num;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
@@ -534,11 +623,12 @@ static bool handle_tun_input(int fd, str
 	return true;
 }
 
-static u32 handle_block_output(int fd, const struct iovec *iov,
-			       unsigned num, struct device *dev)
+static unsigned long handle_block_output(int fd, const struct iovec *iov,
+				       unsigned num, struct device *dev)
 {
 	struct lguest_block_page *p = dev->mem;
-	u32 irq, *lenp;
+	u32 irq; 
+	unsigned long *lenp;
 	unsigned int len, reply_num;
 	struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
 	off64_t device_len, off = (off64_t)p->sector * 512;
@@ -546,11 +636,14 @@ static u32 handle_block_output(int fd, c
 	device_len = *(off64_t *)dev->priv;
 
 	if (off >= device_len)
-		err(1, "Bad offset %llu vs %llu", off, device_len);
+		err(1, "Bad offset %llu vs %llu", 
+				(unsigned long long)off, 
+				(unsigned long long) device_len);
 	if (lseek64(dev->fd, off, SEEK_SET) != off)
 		err(1, "Bad seek to sector %i", p->sector);
 
-	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
+	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", 
+						(unsigned long long)off);
 
 	lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
 	if (!lenp)
@@ -560,7 +653,8 @@ static u32 handle_block_output(int fd, c
 		len = writev(dev->fd, iov, num);
 		if (off + len > device_len) {
 			ftruncate(dev->fd, device_len);
-			errx(1, "Write past end %llu+%u", off, len);
+			errx(1, "Write past end %llu+%u", 
+					(unsigned long long)off, len);
 		}
 		*lenp = 0;
 	} else {
@@ -577,7 +671,7 @@ static void handle_output(int fd, unsign
 			  struct device_list *devices)
 {
 	struct device *i;
-	u32 *lenp;
+	unsigned long *lenp;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
 	unsigned num = 0;
 
@@ -639,7 +733,7 @@ static struct device *new_device(struct 
 				 int fd,
 				 bool (*handle_input)(int, struct device *),
 				 unsigned long watch_off,
-				 u32 (*handle_output)(int,
+				 unsigned long (*handle_output)(int,
 						      const struct iovec *,
 						      unsigned,
 						      struct device *))
@@ -916,9 +1010,12 @@ int main(int argc, char *argv[])
 {
 	unsigned long mem, pgdir, start, page_offset;
 	int c, lguest_fd, waker_fd;
+	int elf_fd;
 	struct device_list device_list;
 	struct lguest_boot_info *boot = (void *)0;
 	const char *initrd_name = NULL;
+	/* 64-bit header is bigger, so this is a worst case for both */
+	Elf64_Ehdr ehdr;
 
 	device_list.max_infd = -1;
 	device_list.dev = NULL;
@@ -958,8 +1055,8 @@ int main(int argc, char *argv[])
 	map_zeroed_pages(0, mem / getpagesize());
 
 	/* Now we load the kernel */
-	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
-			    &page_offset);
+	elf_fd = open_or_die(argv[optind+1], O_RDONLY);
+	start = load_kernel(elf_fd, &page_offset, &ehdr);
 
 	/* Write the device descriptors into memory. */
 	map_device_descriptors(&device_list, mem);
@@ -969,7 +1066,11 @@ int main(int argc, char *argv[])
 		boot->initrd_size = load_initrd(initrd_name, mem);
 
 	/* Set up the initial linar pagetables. */
+#ifndef __x86_64__
 	pgdir = setup_pagetables(mem, boot->initrd_size, page_offset);
+#else
+	pgdir = find_pgt_symbol(elf_fd, &ehdr, page_offset);
+#endif
 
 	/* Give the guest the boot information it needs. */
 	concat(boot->cmdline, argv+optind+2);
Index: linux-2.6.20/Documentation/lguest/x86_64/defines
===================================================================
--- /dev/null
+++ linux-2.6.20/Documentation/lguest/x86_64/defines
@@ -0,0 +1,4 @@
+# For now on x86_64 we'll hard code the location of the lguest binary loader.
+# But when we can get a relocatable kernel, we'll have to work to make this
+# dynamic.
+LGUEST_GUEST_TOP := 0x7f000000
Index: linux-2.6.20/Documentation/lguest/x86_64/lguest_defs.h
===================================================================
--- /dev/null
+++ linux-2.6.20/Documentation/lguest/x86_64/lguest_defs.h
@@ -0,0 +1,15 @@
+#ifndef _LGUEST_DEFS_H_
+#define _LGUEST_DEFS_H_
+
+#include <asm/vsyscall.h>
+
+/* LGUEST_GUEST_TOP comes from the Makefile */
+typedef uint64_t u64;
+#include "../../../include/asm/lguest_user.h"
+
+#define RESERVE_TOP_ADDRESS LGUEST_GUEST_TOP
+
+
+#define BOOT_PGTABLE "boot_level4_pgt"
+
+#endif
_______________________________________________
Virtualization mailing list
Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linux-foundation.org/mailman/listinfo/virtualization

[Index of Archives]     [KVM Development]     [Libvirt Development]     [Libvirt Users]     [CentOS Virtualization]     [Netdev]     [Ethernet Bridging]     [Linux Wireless]     [Kernel Newbies]     [Security]     [Linux for Hams]     [Netfilter]     [Bugtraq]     [Yosemite Forum]     [MIPS Linux]     [ARM Linux]     [Linux RAID]     [Linux Admin]     [Samba]

  Powered by Linux