Re: [RFC PATCH] KVM: x86: Allow Qemu/KVM to use PVH entry point

Andrew Cooper <andrew.cooper3@xxxxxxxxxx> · Tue, 28 Nov 2017 19:41:29 +0000

On 28/11/17 19:34, Maran Wilson wrote:
> For certain applications it is desirable to rapidly boot a KVM virtual
> machine. In cases where legacy hardware and software support within the
> guest is not needed, Qemu should be able to boot directly into the
> uncompressed Linux kernel binary without the need to run firmware.
>
> There already exists an ABI to allow this for Xen PVH guests and the ABI is
> supported by Linux and FreeBSD:
>
>    https://xenbits.xen.org/docs/unstable/misc/hvmlite.html

Just FYI, this link recently became stale following some cleanup.
The document is now:

https://xenbits.xen.org/docs/unstable/misc/pvh.html

~Andrew

>
> This PoC patch enables Qemu to use that same entry point for booting KVM
> guests.
>
> Even though the code is still PoC quality, I'm sending this as an RFC now
> since there are a number of different ways the specific implementation
> details can be handled. I chose a shared code path for Xen and KVM guests
> but could just as easily create a separate code path that is advertised by
> a different ELF note for KVM. There also seems to be some flexibility in
> how the e820 table data is passed and how (or if) it should be identified
> as e820 data. As a starting point, I've chosen the options that seem to
> result in the smallest patch with minimal to no changes required of the
> x86/HVM direct boot ABI.
> ---
>  arch/x86/xen/enlighten_pvh.c | 74 ++++++++++++++++++++++++++++++++------------
>  1 file changed, 55 insertions(+), 19 deletions(-)
>
> diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
> index 98ab176..d93f711 100644
> --- a/arch/x86/xen/enlighten_pvh.c
> +++ b/arch/x86/xen/enlighten_pvh.c
> @@ -31,21 +31,46 @@ static void xen_pvh_arch_setup(void)
>  		acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
>  }
>  
> -static void __init init_pvh_bootparams(void)
> +static void __init init_pvh_bootparams(bool xen_guest)
>  {
>  	struct xen_memory_map memmap;
>  	int rc;
>  
>  	memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
>  
> -	memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
> -	set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
> -	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
> -	if (rc) {
> -		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
> -		BUG();
> +	if (xen_guest) {
> +		memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
> +		set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
> +		rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
> +		if (rc) {
> +			xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
> +			BUG();
> +		}
> +		pvh_bootparams.e820_entries = memmap.nr_entries;
> +	} else if (pvh_start_info.nr_modules > 1) {
> +		/* The second module should be the e820 data for KVM guests */
> +		struct hvm_modlist_entry *modaddr;
> +		char e820_sig[] = "e820 data";
> +		struct boot_e820_entry *ep;
> +		struct e820_table *tp;
> +		char *cmdline_str;
> +		int idx;
> +
> +		modaddr = __va(pvh_start_info.modlist_paddr +
> +			       sizeof(struct hvm_modlist_entry));
> +		cmdline_str = __va(modaddr->cmdline_paddr);
> +
> +		if ((modaddr->cmdline_paddr) &&
> +		    (!strncmp(e820_sig, cmdline_str, sizeof(e820_sig)))) {
> +			tp = __va(modaddr->paddr);
> +			ep = (struct boot_e820_entry *)tp->entries;
> +
> +			pvh_bootparams.e820_entries = tp->nr_entries;
> +
> +			for (idx = 0; idx < tp->nr_entries ; idx++, ep++)
> +				pvh_bootparams.e820_table[idx] = *ep;
> +		}
>  	}
> -	pvh_bootparams.e820_entries = memmap.nr_entries;
>  
>  	if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) {
>  		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr =
> @@ -55,8 +80,9 @@ static void __init init_pvh_bootparams(void)
>  		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type =
>  			E820_TYPE_RESERVED;
>  		pvh_bootparams.e820_entries++;
> -	} else
> +	} else if (xen_guest) {
>  		xen_raw_printk("Warning: Can fit ISA range into e820\n");
> +	}
>  
>  	pvh_bootparams.hdr.cmd_line_ptr =
>  		pvh_start_info.cmdline_paddr;
> @@ -76,7 +102,7 @@ static void __init init_pvh_bootparams(void)
>  	 * environment (i.e. hardware_subarch 0).
>  	 */
>  	pvh_bootparams.hdr.version = 0x212;
> -	pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
> +	pvh_bootparams.hdr.type_of_loader = ((xen_guest ? 0x9 : 0xb) << 4) | 0;
>  }
>  
>  /*
> @@ -85,22 +111,32 @@ static void __init init_pvh_bootparams(void)
>   */
>  void __init xen_prepare_pvh(void)
>  {
> -	u32 msr;
> +
> +	u32 msr = xen_cpuid_base();
>  	u64 pfn;
> +	bool xen_guest = msr ? true : false;
>  
>  	if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
> -		xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
> -				pvh_start_info.magic);
> +		if (xen_guest)
> +			xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
> +					pvh_start_info.magic);
>  		BUG();
>  	}
>  
> -	xen_pvh = 1;
> +	if (xen_guest) {
> +		xen_pvh = 1;
> +
> +		msr = cpuid_ebx(msr + 2);
> +		pfn = __pa(hypercall_page);
> +		wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
> +
> +	} else if (!hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0)) {
> +		BUG();
> +	}
>  
> -	msr = cpuid_ebx(xen_cpuid_base() + 2);
> -	pfn = __pa(hypercall_page);
> -	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
> +	init_pvh_bootparams(xen_guest);
>  
> -	init_pvh_bootparams();
> +	if (xen_guest)
> +		x86_init.oem.arch_setup = xen_pvh_arch_setup;
>  
> -	x86_init.oem.arch_setup = xen_pvh_arch_setup;
>  }