On 1/24/24 4:55 AM, Kirill A. Shutemov wrote: > MADT Multiprocessor Wakeup structure version 1 brings support of CPU > offlining: BIOS provides a reset vector where the CPU has to jump to > for offlining itself. The new TEST mailbox command can be used to test > whether the CPU offlined itself which means the BIOS has control over > the CPU and can online it again via the ACPI MADT wakeup method. > > Add CPU offlining support for the ACPI MADT wakeup method by implementing > custom cpu_die(), play_dead() and stop_this_cpu() SMP operations. > > CPU offlining makes it possible to hand over secondary CPUs over kexec, > not limiting the second kernel to a single CPU. > > The change conforms to the approved ACPI spec change proposal. See the > Link. > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> > Link: https://lore.kernel.org/all/13356251.uLZWGnKmhe@kreacher > --- Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@xxxxxxxxxxxxxxx> > arch/x86/include/asm/acpi.h | 2 + > arch/x86/kernel/acpi/Makefile | 2 +- > arch/x86/kernel/acpi/madt_playdead.S | 28 ++++ > arch/x86/kernel/acpi/madt_wakeup.c | 184 ++++++++++++++++++++++++++- > include/acpi/actbl2.h | 15 ++- > 5 files changed, 227 insertions(+), 4 deletions(-) > create mode 100644 arch/x86/kernel/acpi/madt_playdead.S > > diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h > index 2625b915ae7f..021cafa214c2 100644 > --- a/arch/x86/include/asm/acpi.h > +++ b/arch/x86/include/asm/acpi.h > @@ -81,6 +81,8 @@ union acpi_subtable_headers; > int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, > const unsigned long end); > > +void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa); > + > /* > * Check if the CPU can handle C2 and deeper > */ > diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile > index 8c7329c88a75..37b1f28846de 100644 > --- a/arch/x86/kernel/acpi/Makefile > +++ b/arch/x86/kernel/acpi/Makefile > @@ -4,7 +4,7 @@ 
obj-$(CONFIG_ACPI) += boot.o > obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o > obj-$(CONFIG_ACPI_APEI) += apei.o > obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o > -obj-$(CONFIG_X86_ACPI_MADT_WAKEUP) += madt_wakeup.o > +obj-$(CONFIG_X86_ACPI_MADT_WAKEUP) += madt_wakeup.o madt_playdead.o > > ifneq ($(CONFIG_ACPI_PROCESSOR),) > obj-y += cstate.o > diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S > new file mode 100644 > index 000000000000..4e498d28cdc8 > --- /dev/null > +++ b/arch/x86/kernel/acpi/madt_playdead.S > @@ -0,0 +1,28 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#include <linux/linkage.h> > +#include <asm/nospec-branch.h> > +#include <asm/page_types.h> > +#include <asm/processor-flags.h> > + > + .text > + .align PAGE_SIZE > + > +/* > + * asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS > + * > + * rdi: Address of the ACPI MADT MPWK ResetVector > + * rsi: PGD of the identity mapping > + */ > +SYM_FUNC_START(asm_acpi_mp_play_dead) > + /* Turn off global entries. Following CR3 write will flush them. 
*/ > + movq %cr4, %rdx > + andq $~(X86_CR4_PGE), %rdx > + movq %rdx, %cr4 > + > + /* Switch to identity mapping */ > + movq %rsi, %cr3 > + > + /* Jump to reset vector */ > + ANNOTATE_RETPOLINE_SAFE > + jmp *%rdi > +SYM_FUNC_END(asm_acpi_mp_play_dead) > diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c > index 30820f9de5af..9e984e2191ba 100644 > --- a/arch/x86/kernel/acpi/madt_wakeup.c > +++ b/arch/x86/kernel/acpi/madt_wakeup.c > @@ -1,10 +1,19 @@ > // SPDX-License-Identifier: GPL-2.0-or-later > #include <linux/acpi.h> > #include <linux/cpu.h> > +#include <linux/delay.h> > #include <linux/io.h> > +#include <linux/kexec.h> > +#include <linux/memblock.h> > +#include <linux/pgtable.h> > +#include <linux/sched/hotplug.h> > #include <asm/apic.h> > #include <asm/barrier.h> > +#include <asm/init.h> > +#include <asm/intel_pt.h> > +#include <asm/nmi.h> > #include <asm/processor.h> > +#include <asm/reboot.h> > > /* Physical address of the Multiprocessor Wakeup Structure mailbox */ > static u64 acpi_mp_wake_mailbox_paddr __ro_after_init; > @@ -12,6 +21,154 @@ static u64 acpi_mp_wake_mailbox_paddr __ro_after_init; > /* Virtual address of the Multiprocessor Wakeup Structure mailbox */ > static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox __ro_after_init; > > +static u64 acpi_mp_pgd __ro_after_init; > +static u64 acpi_mp_reset_vector_paddr __ro_after_init; > + > +static void acpi_mp_stop_this_cpu(void) > +{ > + asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd); > +} > + > +static void acpi_mp_play_dead(void) > +{ > + play_dead_common(); > + asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd); > +} > + > +static void acpi_mp_cpu_die(unsigned int cpu) > +{ > + u32 apicid = per_cpu(x86_cpu_to_apicid, cpu); > + unsigned long timeout; > + > + /* > + * Use TEST mailbox command to prove that BIOS got control over > + * the CPU before declaring it dead. 
> + * > + * BIOS has to clear 'command' field of the mailbox. > + */ > + acpi_mp_wake_mailbox->apic_id = apicid; > + smp_store_release(&acpi_mp_wake_mailbox->command, > + ACPI_MP_WAKE_COMMAND_TEST); > + > + /* Don't wait longer than a second. */ > + timeout = USEC_PER_SEC; > + while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout) > + udelay(1); > + > + if (!timeout) Nit: IMO, since the message printed here reports a hand-over failure (not a timeout), you could check for a non-zero acpi_mp_wake_mailbox->command instead of !timeout. But it is up to you. > + pr_err("Failed to hand over CPU %d to BIOS\n", cpu); > +} > + > +/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */ > +static void __init *alloc_pgt_page(void *dummy) > +{ > + return memblock_alloc(PAGE_SIZE, PAGE_SIZE); > +} > + > +static void __init free_pgt_page(void *pgt, void *dummy) > +{ > + return memblock_free(pgt, PAGE_SIZE); > +} > + > +/* > + * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at > + * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches > + * to the identity mapping and the function has to be present at the same spot in > + * the virtual address space before and after switching page tables. 
> + */ > +static int __init init_transition_pgtable(pgd_t *pgd) > +{ > + pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; > + unsigned long vaddr, paddr; > + p4d_t *p4d; > + pud_t *pud; > + pmd_t *pmd; > + pte_t *pte; > + > + vaddr = (unsigned long)asm_acpi_mp_play_dead; > + pgd += pgd_index(vaddr); > + if (!pgd_present(*pgd)) { > + p4d = (p4d_t *)alloc_pgt_page(NULL); > + if (!p4d) > + return -ENOMEM; > + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); > + } > + p4d = p4d_offset(pgd, vaddr); > + if (!p4d_present(*p4d)) { > + pud = (pud_t *)alloc_pgt_page(NULL); > + if (!pud) > + return -ENOMEM; > + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); > + } > + pud = pud_offset(p4d, vaddr); > + if (!pud_present(*pud)) { > + pmd = (pmd_t *)alloc_pgt_page(NULL); > + if (!pmd) > + return -ENOMEM; > + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); > + } > + pmd = pmd_offset(pud, vaddr); > + if (!pmd_present(*pmd)) { > + pte = (pte_t *)alloc_pgt_page(NULL); > + if (!pte) > + return -ENOMEM; > + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); > + } > + pte = pte_offset_kernel(pmd, vaddr); > + > + paddr = __pa(vaddr); > + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); > + > + return 0; > +} > + > +static int __init acpi_mp_setup_reset(u64 reset_vector) > +{ > + pgd_t *pgd; > + struct x86_mapping_info info = { > + .alloc_pgt_page = alloc_pgt_page, > + .free_pgt_page = free_pgt_page, > + .page_flag = __PAGE_KERNEL_LARGE_EXEC, > + .kernpg_flag = _KERNPG_TABLE_NOENC, > + }; > + > + pgd = alloc_pgt_page(NULL); > + if (!pgd) > + return -ENOMEM; > + > + for (int i = 0; i < nr_pfn_mapped; i++) { > + unsigned long mstart, mend; > + > + mstart = pfn_mapped[i].start << PAGE_SHIFT; > + mend = pfn_mapped[i].end << PAGE_SHIFT; > + if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) { > + kernel_ident_mapping_free(&info, pgd); > + return -ENOMEM; > + } > + } > + > + if (kernel_ident_mapping_init(&info, pgd, > + PAGE_ALIGN_DOWN(reset_vector), > + PAGE_ALIGN(reset_vector + 1))) { > + 
kernel_ident_mapping_free(&info, pgd); > + return -ENOMEM; > + } > + > + if (init_transition_pgtable(pgd)) { > + kernel_ident_mapping_free(&info, pgd); > + return -ENOMEM; > + } > + > + smp_ops.play_dead = acpi_mp_play_dead; > + smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu; > + smp_ops.cpu_die = acpi_mp_cpu_die; > + > + acpi_mp_reset_vector_paddr = reset_vector; > + acpi_mp_pgd = __pa(pgd); > + > + return 0; > +} > + > static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip) > { > if (!acpi_mp_wake_mailbox_paddr) { > @@ -97,14 +254,37 @@ int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, > struct acpi_madt_multiproc_wakeup *mp_wake; > > mp_wake = (struct acpi_madt_multiproc_wakeup *)header; > - if (BAD_MADT_ENTRY(mp_wake, end)) > + > + /* > + * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake > + * entry. 'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger > + * than the actual size of the MP wakeup entry in ACPI table because the > + * 'reset_vector' is only available in the V1 MP wakeup structure. > + */ > + if (!mp_wake) > + return -EINVAL; > + if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0) > + return -EINVAL; > + if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0) > return -EINVAL; > > acpi_table_print_madt_entry(&header->common); > > acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address; > > - acpi_mp_disable_offlining(mp_wake); > + if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 && > + mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) { > + if (acpi_mp_setup_reset(mp_wake->reset_vector)) { > + pr_warn("Failed to setup MADT reset vector\n"); > + acpi_mp_disable_offlining(mp_wake); > + } > + } else { > + /* > + * CPU offlining requires version 1 of the ACPI MADT wakeup > + * structure. 
> + */ > + acpi_mp_disable_offlining(mp_wake); > + } > > apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu); > > diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h > index e1a395af7591..2aedda70ef88 100644 > --- a/include/acpi/actbl2.h > +++ b/include/acpi/actbl2.h > @@ -1120,8 +1120,20 @@ struct acpi_madt_multiproc_wakeup { > u16 version; > u32 reserved; /* reserved - must be zero */ > u64 mailbox_address; > + u64 reset_vector; > }; > > +/* Values for Version field above */ > + > +enum acpi_madt_multiproc_wakeup_version { > + ACPI_MADT_MP_WAKEUP_VERSION_NONE = 0, > + ACPI_MADT_MP_WAKEUP_VERSION_V1 = 1, > + ACPI_MADT_MP_WAKEUP_VERSION_RESERVED = 2, /* 2 and greater are reserved */ > +}; > + > +#define ACPI_MADT_MP_WAKEUP_SIZE_V0 16 > +#define ACPI_MADT_MP_WAKEUP_SIZE_V1 24 > + > #define ACPI_MULTIPROC_WAKEUP_MB_OS_SIZE 2032 > #define ACPI_MULTIPROC_WAKEUP_MB_FIRMWARE_SIZE 2048 > > @@ -1134,7 +1146,8 @@ struct acpi_madt_multiproc_wakeup_mailbox { > u8 reserved_firmware[ACPI_MULTIPROC_WAKEUP_MB_FIRMWARE_SIZE]; /* reserved for firmware use */ > }; > > -#define ACPI_MP_WAKE_COMMAND_WAKEUP 1 > +#define ACPI_MP_WAKE_COMMAND_WAKEUP 1 > +#define ACPI_MP_WAKE_COMMAND_TEST 2 > > /* 17: CPU Core Interrupt Controller (ACPI 6.5) */ > -- Sathyanarayanan Kuppuswamy Linux Kernel Developer _______________________________________________ kexec mailing list kexec@xxxxxxxxxxxxxxxxxxx http://lists.infradead.org/mailman/listinfo/kexec