On Mon, Jun 03, 2024 at 10:39:30AM +0200, Borislav Petkov wrote: > > +/* > > + * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at > > + * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches > > + * to the identity mapping and the function has be present at the same spot in > > + * the virtual address space before and after switching page tables. > > + */ > > +static int __init init_transition_pgtable(pgd_t *pgd) > > This looks like a generic helper which should be in set_memory.c. And > looking at that file, there's populate_pgd() which does pretty much the > same thing, if I squint real hard. > > Let's tone down the duplication. Okay, there is a function called kernel_map_pages_in_pgd() in set_memory.c that does what we need here. I tried to use it, but encountered a few issues: - The code in set_memory.c allocates memory using the buddy allocator, which is not yet ready. We can work around this limitation by delaying the initialization of offlining until later, using a separate early_initcall(); - I noticed a complaint that the allocation is being done from an atomic context: a spinlock called cpa_lock is taken when populate_pgd() allocates memory. I am not sure why this was not noticed before. kernel_map_pages_in_pgd() has only been used in EFI mapping initialization so far, so maybe it is somehow special, I don't know. I was able to address this issue by switching cpa_lock to a mutex. However, this solution will only work if the callers for set_memory interfaces are not called from an atomic context. I need to verify if this is the case. - The function __flush_tlb_all() in kernel_(un)map_pages_in_pgd() must be called with preemption disabled. Once again, I am unsure why this has not caused issues in the EFI case. - I discovered a bug in kernel_ident_mapping_free() when it is used on a machine with 5-level paging. I will submit a proper patch to fix this issue. The fixup is below. Any comments? diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c index 6cfe762be28b..fbbfe78f7f27 100644 --- a/arch/x86/kernel/acpi/madt_wakeup.c +++ b/arch/x86/kernel/acpi/madt_wakeup.c @@ -59,82 +59,55 @@ static void acpi_mp_cpu_die(unsigned int cpu) pr_err("Failed to hand over CPU %d to BIOS\n", cpu); } +static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake) +{ + cpu_hotplug_disable_offlining(); + + /* + * ACPI MADT doesn't allow to offline a CPU after it was onlined. This + * limits kexec: the second kernel won't be able to use more than one CPU. + * + * To prevent a kexec kernel from onlining secondary CPUs invalidate the + * mailbox address in the ACPI MADT wakeup structure which prevents a + * kexec kernel to use it. + * + * This is safe as the booting kernel has the mailbox address cached + * already and acpi_wakeup_cpu() uses the cached value to bring up the + * secondary CPUs. + * + * Note: This is a Linux specific convention and not covered by the + * ACPI specification. + */ + mp_wake->mailbox_address = 0; +} + /* The argument is required to match type of x86_mapping_info::alloc_pgt_page */ static void __init *alloc_pgt_page(void *dummy) { - return memblock_alloc(PAGE_SIZE, PAGE_SIZE); + return (void *)get_zeroed_page(GFP_KERNEL); } static void __init free_pgt_page(void *pgt, void *dummy) { - return memblock_free(pgt, PAGE_SIZE); + return free_page((unsigned long)pgt); } -/* - * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at - * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches - * to the identity mapping and the function has be present at the same spot in - * the virtual address space before and after switching page tables. - */ -static int __init init_transition_pgtable(pgd_t *pgd) -{ - pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; - unsigned long vaddr, paddr; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - vaddr = (unsigned long)asm_acpi_mp_play_dead; - pgd += pgd_index(vaddr); - if (!pgd_present(*pgd)) { - p4d = (p4d_t *)alloc_pgt_page(NULL); - if (!p4d) - return -ENOMEM; - set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); - } - p4d = p4d_offset(pgd, vaddr); - if (!p4d_present(*p4d)) { - pud = (pud_t *)alloc_pgt_page(NULL); - if (!pud) - return -ENOMEM; - set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); - } - pud = pud_offset(p4d, vaddr); - if (!pud_present(*pud)) { - pmd = (pmd_t *)alloc_pgt_page(NULL); - if (!pmd) - return -ENOMEM; - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - } - pmd = pmd_offset(pud, vaddr); - if (!pmd_present(*pmd)) { - pte = (pte_t *)alloc_pgt_page(NULL); - if (!pte) - return -ENOMEM; - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); - } - pte = pte_offset_kernel(pmd, vaddr); - - paddr = __pa(vaddr); - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); - - return 0; -} - -static int __init acpi_mp_setup_reset(u64 reset_vector) +static int __init acpi_mp_setup_reset(union acpi_subtable_headers *header, + const unsigned long end) { + struct acpi_madt_multiproc_wakeup *mp_wake; struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, .free_pgt_page = free_pgt_page, .page_flag = __PAGE_KERNEL_LARGE_EXEC, - .kernpg_flag = _KERNPG_TABLE_NOENC, + .kernpg_flag = _KERNPG_TABLE, }; + unsigned long vaddr, pfn; pgd_t *pgd; pgd = alloc_pgt_page(NULL); if (!pgd) - return -ENOMEM; + goto err; for (int i = 0; i < nr_pfn_mapped; i++) { unsigned long mstart, mend; @@ -143,30 +116,45 @@ static int __init acpi_mp_setup_reset(u64 reset_vector) mend = pfn_mapped[i].end << PAGE_SHIFT; if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) { kernel_ident_mapping_free(&info, pgd); - return -ENOMEM; + goto err; } } if (kernel_ident_mapping_init(&info, pgd, - PAGE_ALIGN_DOWN(reset_vector), - PAGE_ALIGN(reset_vector + 1))) { + PAGE_ALIGN_DOWN(acpi_mp_reset_vector_paddr), + PAGE_ALIGN(acpi_mp_reset_vector_paddr + 1))) { kernel_ident_mapping_free(&info, pgd); - return -ENOMEM; + goto err; } - if (init_transition_pgtable(pgd)) { + /* + * Make sure asm_acpi_mp_play_dead() is present in the identity mapping + * at the same place as in the kernel page tables. + * + * asm_acpi_mp_play_dead() switches to the identity mapping and the + * function has be present at the same spot in the virtual address space + * before and after switching page tables. + */ + vaddr = (unsigned long)asm_acpi_mp_play_dead; + pfn = __pa(vaddr) >> PAGE_SHIFT; + if (kernel_map_pages_in_pgd(pgd, pfn, vaddr, 1, _KERNPG_TABLE)) { kernel_ident_mapping_free(&info, pgd); - return -ENOMEM; + goto err; } smp_ops.play_dead = acpi_mp_play_dead; smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu; smp_ops.cpu_die = acpi_mp_cpu_die; - acpi_mp_reset_vector_paddr = reset_vector; acpi_mp_pgd = __pa(pgd); return 0; +err: + pr_warn("Failed to setup MADT reset vector\n"); + mp_wake = (struct acpi_madt_multiproc_wakeup *)header; + acpi_mp_disable_offlining(mp_wake); + return -ENOMEM; + } static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip) @@ -226,28 +214,6 @@ static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip) return 0; } -static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake) -{ - cpu_hotplug_disable_offlining(); - - /* - * ACPI MADT doesn't allow to offline a CPU after it was onlined. This - * limits kexec: the second kernel won't be able to use more than one CPU. - * - * To prevent a kexec kernel from onlining secondary CPUs invalidate the - * mailbox address in the ACPI MADT wakeup structure which prevents a - * kexec kernel to use it. - * - * This is safe as the booting kernel has the mailbox address cached - * already and acpi_wakeup_cpu() uses the cached value to bring up the - * secondary CPUs. - * - * Note: This is a Linux specific convention and not covered by the - * ACPI specification. - */ - mp_wake->mailbox_address = 0; -} - int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, const unsigned long end) { @@ -274,10 +240,7 @@ int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 && mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) { - if (acpi_mp_setup_reset(mp_wake->reset_vector)) { - pr_warn("Failed to setup MADT reset vector\n"); - acpi_mp_disable_offlining(mp_wake); - } + acpi_mp_reset_vector_paddr = mp_wake->reset_vector; } else { /* * CPU offlining requires version 1 of the ACPI MADT wakeup @@ -290,3 +253,13 @@ int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, return 0; } + +static int __init acpi_mp_offline_init(void) +{ + if (!acpi_mp_reset_vector_paddr) + return 0; + + return acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP, + acpi_mp_setup_reset, 1); +} +early_initcall(acpi_mp_offline_init); diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 3996af7b4abf..c45127265f2f 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -60,7 +60,7 @@ static void free_p4d(struct x86_mapping_info *info, pgd_t *pgd) } if (pgtable_l5_enabled()) - info->free_pgt_page(pgd, info->context); + info->free_pgt_page(p4d, info->context); } void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 443a97e515c0..72715674f492 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -69,7 +69,7 @@ static const int cpa_warn_level = CPA_PROTECT; * entries change the page attribute in parallel to some other cpu * splitting a large page entry along with changing the attribute. */ -static DEFINE_SPINLOCK(cpa_lock); +static DEFINE_MUTEX(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 @@ -1186,10 +1186,10 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, struct page *base; if (!debug_pagealloc_enabled()) - spin_unlock(&cpa_lock); + mutex_unlock(&cpa_lock); base = alloc_pages(GFP_KERNEL, 0); if (!debug_pagealloc_enabled()) - spin_lock(&cpa_lock); + mutex_lock(&cpa_lock); if (!base) return -ENOMEM; @@ -1804,10 +1804,10 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary) cpa->numpages = 1; if (!debug_pagealloc_enabled()) - spin_lock(&cpa_lock); + mutex_lock(&cpa_lock); ret = __change_page_attr(cpa, primary); if (!debug_pagealloc_enabled()) - spin_unlock(&cpa_lock); + mutex_unlock(&cpa_lock); if (ret) goto out; @@ -2516,7 +2516,9 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 1); + preempt_disable(); __flush_tlb_all(); + preempt_enable(); out: return retval; @@ -2551,7 +2553,9 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); retval = __change_page_attr_set_clr(&cpa, 1); + preempt_disable(); __flush_tlb_all(); + preempt_enable(); return retval; } -- Kiryl Shutsemau / Kirill A. Shutemov