> From: Michael Kelley (LINUX) <mikelley@xxxxxxxxxxxxx> > Sent: Monday, May 1, 2023 10:33 AM > ... > From: Dexuan Cui > > > > Add Hyper-V specific code so that a TDX guest can run on Hyper-V: > > No need to use hv_vp_assist_page. > > Don't use the unsafe Hyper-V TSC page. > > Don't try to use HV_REGISTER_CRASH_CTL. > > Don't trust Hyper-V's TLB-flushing hypercalls. > > Don't use lazy EOI. > > Share SynIC Event/Message pages and VMBus Monitor pages with the > > host. > > This patch no longer does anything with the VMBus monitor pages. Sorry, I forgot to update the commit log. Will drop this from the log. > > Use pgprot_decrypted(PAGE_KERNEL)in hv_ringbuffer_init(). > > The above line in the commit message is stale and can be dropped. Will drop this from the commit log. > > @@ -116,6 +117,7 @@ int hv_synic_alloc(void) > > { > > int cpu; > > struct hv_per_cpu_context *hv_cpu; > > + int ret = -ENOMEM; > > > > /* > > * First, zero all per-cpu memory areas so hv_synic_free() can > > @@ -159,6 +161,28 @@ int hv_synic_alloc(void) > > goto err; > > } > > } > > + > > + /* It's better to leak the page if the decryption fails. */ > > + if (hv_isolation_type_tdx()) { > > + ret = set_memory_decrypted( > > + (unsigned long)hv_cpu->synic_message_page, 1); > > + if (ret) { > > + pr_err("Failed to decrypt SYNIC msg page\n"); > > + hv_cpu->synic_message_page = NULL; > > + goto err; > > + } > > + > > + ret = set_memory_decrypted( > > + (unsigned long)hv_cpu->synic_event_page, 1); > > + if (ret) { > > + pr_err("Failed to decrypt SYNIC event page\n"); > > + hv_cpu->synic_event_page = NULL; > > + goto err; > > + } > > The error handling still doesn't work quite correctly. In the TDX case, upon > exiting this function, the synic_message_page and the synic_event_page > must > each either be mapped decrypted or be NULL. This requirement is so > that hv_synic_free() will do the right thing in changing the mapping back to > encrypted. hv_synic_free() can't handle a non-NULL page being encrypted. > > In the above code, if we fail to decrypt the synic_message_page, then setting > it to NULL will leak the page (which we'll live with) and ensures that > hv_synic_free() > will handle it correctly. But at that point we'll exit with synic_event_page > non-NULL and in the encrypted state, which hv_synic_free() can't handle. > > Michael Thanks for spotting the issue! I think the below extra changes should do the job: @@ -121,91 +121,102 @@ int hv_synic_alloc(void) /* * First, zero all per-cpu memory areas so hv_synic_free() can * detect what memory has been allocated and cleanup properly * after any failures. */ for_each_present_cpu(cpu) { hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); memset(hv_cpu, 0, sizeof(*hv_cpu)); } hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask), GFP_KERNEL); if (hv_context.hv_numa_map == NULL) { pr_err("Unable to allocate NUMA map\n"); goto err; } for_each_present_cpu(cpu) { hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); tasklet_init(&hv_cpu->msg_dpc, vmbus_on_msg_dpc, (unsigned long) hv_cpu); /* * Synic message and event pages are allocated by paravisor. * Skip these pages allocation here. */ if (!hv_isolation_type_snp() && !hv_root_partition) { hv_cpu->synic_message_page = (void *)get_zeroed_page(GFP_ATOMIC); if (hv_cpu->synic_message_page == NULL) { pr_err("Unable to allocate SYNIC message page\n"); goto err; } hv_cpu->synic_event_page = (void *)get_zeroed_page(GFP_ATOMIC); if (hv_cpu->synic_event_page == NULL) { pr_err("Unable to allocate SYNIC event page\n"); + + free_page((unsigned long)hv_cpu->synic_message_page); + hv_cpu->synic_message_page = NULL; + goto err; } } /* It's better to leak the page if the decryption fails. */ if (hv_isolation_type_tdx()) { ret = set_memory_decrypted( (unsigned long)hv_cpu->synic_message_page, 1); if (ret) { pr_err("Failed to decrypt SYNIC msg page\n"); hv_cpu->synic_message_page = NULL; + + /* + * Free the event page so that a TDX VM won't + * try to encrypt the page in hv_synic_free(). + */ + free_page((unsigned long)hv_cpu->synic_event_page); + hv_cpu->synic_event_page = NULL; goto err; } ret = set_memory_decrypted( (unsigned long)hv_cpu->synic_event_page, 1); if (ret) { pr_err("Failed to decrypt SYNIC event page\n"); hv_cpu->synic_event_page = NULL; goto err; } memset(hv_cpu->synic_message_page, 0, PAGE_SIZE); memset(hv_cpu->synic_event_page, 0, PAGE_SIZE); } } return 0; err: /* * Any memory allocations that succeeded will be freed when * the caller cleans up by calling hv_synic_free() */ return ret; } I'm going to use the below (i.e. v5 + the above extra changes) in v6. Please let me know if there is still any bug. @@ -116,6 +117,7 @@ int hv_synic_alloc(void) { int cpu; struct hv_per_cpu_context *hv_cpu; + int ret = -ENOMEM; /* * First, zero all per-cpu memory areas so hv_synic_free() can @@ -156,9 +158,42 @@ int hv_synic_alloc(void) (void *)get_zeroed_page(GFP_ATOMIC); if (hv_cpu->synic_event_page == NULL) { pr_err("Unable to allocate SYNIC event page\n"); + + free_page((unsigned long)hv_cpu->synic_message_page); + hv_cpu->synic_message_page = NULL; + goto err; } } + + /* It's better to leak the page if the decryption fails. */ + if (hv_isolation_type_tdx()) { + ret = set_memory_decrypted( + (unsigned long)hv_cpu->synic_message_page, 1); + if (ret) { + pr_err("Failed to decrypt SYNIC msg page\n"); + hv_cpu->synic_message_page = NULL; + + /* + * Free the event page so that a TDX VM won't + * try to encrypt the page in hv_synic_free(). + */ + free_page((unsigned long)hv_cpu->synic_event_page); + hv_cpu->synic_event_page = NULL; + goto err; + } + + ret = set_memory_decrypted( + (unsigned long)hv_cpu->synic_event_page, 1); + if (ret) { + pr_err("Failed to decrypt SYNIC event page\n"); + hv_cpu->synic_event_page = NULL; + goto err; + } + + memset(hv_cpu->synic_message_page, 0, PAGE_SIZE); + memset(hv_cpu->synic_event_page, 0, PAGE_SIZE); + } } return 0; @@ -167,18 +202,40 @@ int hv_synic_alloc(void) * Any memory allocations that succeeded will be freed when * the caller cleans up by calling hv_synic_free() */ - return -ENOMEM; + return ret; } void hv_synic_free(void) { int cpu; + int ret; for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); + /* It's better to leak the page if the encryption fails. */ + if (hv_isolation_type_tdx()) { + if (hv_cpu->synic_message_page) { + ret = set_memory_encrypted((unsigned long) + hv_cpu->synic_message_page, 1); + if (ret) { + pr_err("Failed to encrypt SYNIC msg page\n"); + hv_cpu->synic_message_page = NULL; + } + } + + if (hv_cpu->synic_event_page) { + ret = set_memory_encrypted((unsigned long) + hv_cpu->synic_event_page, 1); + if (ret) { + pr_err("Failed to encrypt SYNIC event page\n"); + hv_cpu->synic_event_page = NULL; + } + } + } + free_page((unsigned long)hv_cpu->synic_event_page); free_page((unsigned long)hv_cpu->synic_message_page); } I'll post a separate patch (currently if hv_synic_alloc() --> get_zeroed_page() fails, hv_context.hv_numa_map is leaked): --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1515,27 +1515,27 @@ static int vmbus_bus_init(void) } ret = hv_synic_alloc(); if (ret) goto err_alloc; /* * Initialize the per-cpu interrupt state and stimer state. * Then connect to the host. */ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", hv_synic_init, hv_synic_cleanup); if (ret < 0) - goto err_cpuhp; + goto err_alloc; hyperv_cpuhp_online = ret; ret = vmbus_connect(); if (ret) goto err_connect; if (hv_is_isolation_supported()) sysctl_record_panic_msg = 0; /* * Only register if the crash MSRs are available */ if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { @@ -1567,29 +1567,28 @@ static int vmbus_bus_init(void) /* * Always register the vmbus unload panic notifier because we * need to shut the VMbus channel connection on panic. */ atomic_notifier_chain_register(&panic_notifier_list, &hyperv_panic_vmbus_unload_block); vmbus_request_offers(); return 0; err_connect: cpuhp_remove_state(hyperv_cpuhp_online); -err_cpuhp: - hv_synic_free(); err_alloc: + hv_synic_free(); if (vmbus_irq == -1) { hv_remove_vmbus_handler(); } else { free_percpu_irq(vmbus_irq, vmbus_evt); free_percpu(vmbus_evt); } err_setup: bus_unregister(&hv_bus); unregister_sysctl_table(hv_ctl_table_hdr); hv_ctl_table_hdr = NULL; return ret; }