On 28/11/17 20:34, Maran Wilson wrote: > For certain applications it is desirable to rapidly boot a KVM virtual > machine. In cases where legacy hardware and software support within the > guest is not needed, Qemu should be able to boot directly into the > uncompressed Linux kernel binary without the need to run firmware. > > There already exists an ABI to allow this for Xen PVH guests and the ABI is > supported by Linux and FreeBSD: > > https://xenbits.xen.org/docs/unstable/misc/hvmlite.html > > This PoC patch enables Qemu to use that same entry point for booting KVM > guests. > > Even though the code is still PoC quality, I'm sending this as an RFC now > since there are a number of different ways the specific implementation > details can be handled. I chose a shared code path for Xen and KVM guests > but could just as easily create a separate code path that is advertised by > a different ELF note for KVM. There also seems to be some flexibility in > how the e820 table data is passed and how (or if) it should be identified > as e820 data. As a starting point, I've chosen the options that seem to > result in the smallest patch with minimal to no changes required of the > x86/HVM direct boot ABI. I like the idea. However, I'd rather split up the different hypervisor types early and use a common set of service functions instead of special-casing xen_guest everywhere. This would make it much easier to support the KVM PVH boot without the need to configure the kernel with CONFIG_XEN. Another option would be to use the same boot path as with GRUB: set up the boot params in the zero page and start at startup_32. 
Juergen > --- > arch/x86/xen/enlighten_pvh.c | 74 ++++++++++++++++++++++++++++++++------------ > 1 file changed, 55 insertions(+), 19 deletions(-) > > diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c > index 98ab176..d93f711 100644 > --- a/arch/x86/xen/enlighten_pvh.c > +++ b/arch/x86/xen/enlighten_pvh.c > @@ -31,21 +31,46 @@ static void xen_pvh_arch_setup(void) > acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; > } > > -static void __init init_pvh_bootparams(void) > +static void __init init_pvh_bootparams(bool xen_guest) > { > struct xen_memory_map memmap; > int rc; > > memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); > > - memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); > - set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); > - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); > - if (rc) { > - xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); > - BUG(); > + if (xen_guest) { > + memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); > + set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); > + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); > + if (rc) { > + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); > + BUG(); > + } > + pvh_bootparams.e820_entries = memmap.nr_entries; > + } else if (pvh_start_info.nr_modules > 1) { > + /* The second module should be the e820 data for KVM guests */ > + struct hvm_modlist_entry *modaddr; > + char e820_sig[] = "e820 data"; > + struct boot_e820_entry *ep; > + struct e820_table *tp; > + char *cmdline_str; > + int idx; > + > + modaddr = __va(pvh_start_info.modlist_paddr + > + sizeof(struct hvm_modlist_entry)); > + cmdline_str = __va(modaddr->cmdline_paddr); > + > + if ((modaddr->cmdline_paddr) && > + (!strncmp(e820_sig, cmdline_str, sizeof(e820_sig)))) { > + tp = __va(modaddr->paddr); > + ep = (struct boot_e820_entry *)tp->entries; > + > + pvh_bootparams.e820_entries = tp->nr_entries; > + > + for (idx = 0; idx < tp->nr_entries ; idx++, ep++) > + 
pvh_bootparams.e820_table[idx] = *ep; > + } > } > - pvh_bootparams.e820_entries = memmap.nr_entries; > > if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) { > pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr = > @@ -55,8 +80,9 @@ static void __init init_pvh_bootparams(void) > pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type = > E820_TYPE_RESERVED; > pvh_bootparams.e820_entries++; > - } else > + } else if (xen_guest) { > xen_raw_printk("Warning: Can fit ISA range into e820\n"); > + } > > pvh_bootparams.hdr.cmd_line_ptr = > pvh_start_info.cmdline_paddr; > @@ -76,7 +102,7 @@ static void __init init_pvh_bootparams(void) > * environment (i.e. hardware_subarch 0). > */ > pvh_bootparams.hdr.version = 0x212; > - pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ > + pvh_bootparams.hdr.type_of_loader = ((xen_guest ? 0x9 : 0xb) << 4) | 0; > } > > /* > @@ -85,22 +111,32 @@ static void __init init_pvh_bootparams(void) > */ > void __init xen_prepare_pvh(void) > { > - u32 msr; > + > + u32 msr = xen_cpuid_base(); > u64 pfn; > + bool xen_guest = msr ? true : false; > > if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { > - xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", > - pvh_start_info.magic); > + if (xen_guest) > + xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", > + pvh_start_info.magic); > BUG(); > } > > - xen_pvh = 1; > + if (xen_guest) { > + xen_pvh = 1; > + > + msr = cpuid_ebx(msr + 2); > + pfn = __pa(hypercall_page); > + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); > + > + } else if (!hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0)) { > + BUG(); > + } > > - msr = cpuid_ebx(xen_cpuid_base() + 2); > - pfn = __pa(hypercall_page); > - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); > + init_pvh_bootparams(xen_guest); > > - init_pvh_bootparams(); > + if (xen_guest) > + x86_init.oem.arch_setup = xen_pvh_arch_setup; > > - x86_init.oem.arch_setup = xen_pvh_arch_setup; > } >