The KVM page table is initialized by adding core memory mappings: the
kernel text, the per-cpu memory, the kvm module, the cpu_entry_area,
the %esp fixup stacks and the IRQ stacks.

Signed-off-by: Alexandre Chartre <alexandre.chartre@xxxxxxxxxx>
---
 arch/x86/kernel/cpu/common.c |    2 +
 arch/x86/kvm/isolation.c     |  131 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/isolation.h     |   10 +++
 include/linux/percpu.h       |    2 +
 mm/percpu.c                  |    6 +-
 5 files changed, 149 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3764054..0fa44b1 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1511,6 +1511,8 @@ static __init int setup_clearcpuid(char *arg)
 EXPORT_PER_CPU_SYMBOL(current_task);
 
 DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
+EXPORT_PER_CPU_SYMBOL_GPL(hardirq_stack_ptr);
+
 DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
diff --git a/arch/x86/kvm/isolation.c b/arch/x86/kvm/isolation.c
index 2052abf..cf5ee0d 100644
--- a/arch/x86/kvm/isolation.c
+++ b/arch/x86/kvm/isolation.c
@@ -10,6 +10,8 @@
 #include <linux/printk.h>
 #include <linux/slab.h>
 
+#include <asm/cpu_entry_area.h>
+#include <asm/processor.h>
 #include <asm/mmu_context.h>
 #include <asm/pgalloc.h>
 
@@ -88,6 +90,8 @@ struct mm_struct kvm_mm = {
 DEFINE_STATIC_KEY_FALSE(kvm_isolation_enabled);
 EXPORT_SYMBOL(kvm_isolation_enabled);
 
+static void kvm_isolation_uninit_page_table(void);
+static void kvm_isolation_uninit_mm(void);
 static void kvm_clear_mapping(void *ptr, size_t size,
 			      enum page_table_level level);
 
@@ -1024,10 +1028,130 @@ int kvm_copy_percpu_mapping(void *percpu_ptr, size_t size)
 
 EXPORT_SYMBOL(kvm_copy_percpu_mapping);
 
+static int kvm_isolation_init_page_table(void)
+{
+	void *stack;
+	int cpu, rv;
+
+	/*
+	 * Copy the mapping for all the kernel text. We copy at the PMD
+	 * level since the PUD is shared with the module mapping space.
+	 */
+	rv = kvm_copy_mapping((void *)__START_KERNEL_map, KERNEL_IMAGE_SIZE,
+			      PGT_LEVEL_PMD);
+	if (rv)
+		goto out_uninit_page_table;
+
+	/* Copy the mapping of per-cpu memory. */
+	rv = kvm_copy_mapping(pcpu_base_addr, pcpu_unit_size * pcpu_nr_units,
+			      PGT_LEVEL_PMD);
+	if (rv)
+		goto out_uninit_page_table;
+
+	/*
+	 * Copy the mapping for cpu_entry_area and the %esp fixup stacks
+	 * (this is based on the PTI userland address space, but probably
+	 * not needed because the KVM address space is not directly
+	 * entered from userspace). They can both be copied at the P4D
+	 * level since they each have a dedicated P4D entry.
+	 */
+	rv = kvm_copy_mapping((void *)CPU_ENTRY_AREA_PER_CPU, P4D_SIZE,
+			      PGT_LEVEL_P4D);
+	if (rv)
+		goto out_uninit_page_table;
+
+#ifdef CONFIG_X86_ESPFIX64
+	rv = kvm_copy_mapping((void *)ESPFIX_BASE_ADDR, P4D_SIZE,
+			      PGT_LEVEL_P4D);
+	if (rv)
+		goto out_uninit_page_table;
+#endif
+
+#ifdef CONFIG_VMAP_STACK
+	/*
+	 * Interrupt stacks are vmap'ed with guard pages, so we need to
+	 * copy mappings.
+	 */
+	for_each_possible_cpu(cpu) {
+		stack = per_cpu(hardirq_stack_ptr, cpu);
+		pr_debug("IRQ Stack %px\n", stack);
+		if (!stack)
+			continue;
+		rv = kvm_copy_ptes(stack - IRQ_STACK_SIZE, IRQ_STACK_SIZE);
+		if (rv)
+			goto out_uninit_page_table;
+	}
+
+#endif
+
+	/* Copy the mapping of the current module (kvm). */
+	rv = kvm_copy_module_mapping();
+	if (rv)
+		goto out_uninit_page_table;
+
+	return 0;
+
+out_uninit_page_table:
+	kvm_isolation_uninit_page_table();
+	return rv;
+}
+
+/*
+ * Free all buffers used by the kvm page table. These buffers are stored
+ * in the kvm_pgt_dgroup_list.
+ */
+static void kvm_isolation_uninit_page_table(void)
+{
+	struct pgt_directory_group *dgroup, *dgroup_next;
+	enum page_table_level level;
+	void *ptr;
+	int i;
+
+	mutex_lock(&kvm_pgt_dgroup_lock);
+
+	list_for_each_entry_safe(dgroup, dgroup_next,
+				 &kvm_pgt_dgroup_list, list) {
+
+		for (i = 0; i < dgroup->count; i++) {
+			ptr = dgroup->directory[i].ptr;
+			level = dgroup->directory[i].level;
+
+			switch (dgroup->directory[i].level) {
+
+			case PGT_LEVEL_PTE:
+				kvm_pte_free(NULL, ptr);
+				break;
+
+			case PGT_LEVEL_PMD:
+				kvm_pmd_free(NULL, ptr);
+				break;
+
+			case PGT_LEVEL_PUD:
+				kvm_pud_free(NULL, ptr);
+				break;
+
+			case PGT_LEVEL_P4D:
+				kvm_p4d_free(NULL, ptr);
+				break;
+
+			default:
+				pr_err("unexpected page directory %d for %px\n",
+				       level, ptr);
+			}
+		}
+
+		list_del(&dgroup->list);
+		kfree(dgroup);
+	}
+
+	mutex_unlock(&kvm_pgt_dgroup_lock);
+}
+
 static int kvm_isolation_init_mm(void)
 {
 	pgd_t *kvm_pgd;
 	gfp_t gfp_mask;
+	int rv;
 
 	gfp_mask = GFP_KERNEL | __GFP_ZERO;
 	kvm_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
@@ -1054,6 +1178,12 @@ static int kvm_isolation_init_mm(void)
 	mm_init_cpumask(&kvm_mm);
 	init_new_context(NULL, &kvm_mm);
 
+	rv = kvm_isolation_init_page_table();
+	if (rv) {
+		kvm_isolation_uninit_mm();
+		return rv;
+	}
+
 	return 0;
 }
 
@@ -1065,6 +1195,7 @@ static void kvm_isolation_uninit_mm(void)
 
 	destroy_context(&kvm_mm);
 
+	kvm_isolation_uninit_page_table();
 	kvm_free_all_range_mapping();
 
 #ifdef CONFIG_PAGE_TABLE_ISOLATION
diff --git a/arch/x86/kvm/isolation.h b/arch/x86/kvm/isolation.h
index 3ef2060..1f79e28 100644
--- a/arch/x86/kvm/isolation.h
+++ b/arch/x86/kvm/isolation.h
@@ -3,6 +3,16 @@
 #define ARCH_X86_KVM_ISOLATION_H
 
 #include <linux/kvm_host.h>
+#include <linux/export.h>
+
+/*
+ * Copy the memory mapping for the current module. This is defined as a
+ * macro to ensure it is expanded in the module making the call so that
+ * THIS_MODULE has the correct value.
+ */
+#define kvm_copy_module_mapping()				\
+	(kvm_copy_ptes(THIS_MODULE->core_layout.base,		\
+		       THIS_MODULE->core_layout.size))
 
 DECLARE_STATIC_KEY_FALSE(kvm_isolation_enabled);
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 70b7123..fb0ab9a 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -70,6 +70,8 @@
 
 extern void *pcpu_base_addr;
 extern const unsigned long *pcpu_unit_offsets;
+extern int pcpu_unit_size;
+extern int pcpu_nr_units;
 
 struct pcpu_group_info {
 	int			nr_units;	/* aligned # of units */
diff --git a/mm/percpu.c b/mm/percpu.c
index 68dd2e7..b68b3d8 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -119,8 +119,10 @@
 #endif /* CONFIG_SMP */
 
 static int pcpu_unit_pages __ro_after_init;
-static int pcpu_unit_size __ro_after_init;
-static int pcpu_nr_units __ro_after_init;
+int pcpu_unit_size __ro_after_init;
+EXPORT_SYMBOL(pcpu_unit_size);
+int pcpu_nr_units __ro_after_init;
+EXPORT_SYMBOL(pcpu_nr_units);
 static int pcpu_atom_size __ro_after_init;
 int pcpu_nr_slots __ro_after_init;
 static size_t pcpu_chunk_struct_size __ro_after_init;
-- 
1.7.1
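For context: kvm_isolation_uninit_page_table() relies on the directory
tracking (kvm_pgt_dgroup_list, struct pgt_directory_group) introduced
earlier in this series, whose definitions are not visible in this patch.
Below is a minimal, self-contained userspace sketch of that bookkeeping
pattern. The names pgt_directory, PGT_DIRECTORY_GROUP_SIZE, pgt_track()
and pgt_untrack_all(), the group capacity, and the singly-linked list are
illustrative assumptions only; the kernel code uses list_head plus a
mutex and frees each page with the level-specific kvm_p*_free() helpers
rather than free().

/*
 * Userspace model of the kvm_pgt_dgroup_list bookkeeping torn down by
 * kvm_isolation_uninit_page_table(). Names, sizes and the singly-linked
 * list are assumptions for illustration, not the kernel definitions.
 */
#include <stdio.h>
#include <stdlib.h>

enum page_table_level {
	PGT_LEVEL_PTE, PGT_LEVEL_PMD, PGT_LEVEL_PUD, PGT_LEVEL_P4D
};

#define PGT_DIRECTORY_GROUP_SIZE 64	/* assumed capacity per group */

struct pgt_directory {
	enum page_table_level level;
	void *ptr;			/* page backing this directory */
};

struct pgt_directory_group {
	struct pgt_directory_group *next;
	int count;
	struct pgt_directory directory[PGT_DIRECTORY_GROUP_SIZE];
};

static struct pgt_directory_group *dgroup_list;

/* Record a directory page at allocation time so teardown can free it. */
static int pgt_track(void *ptr, enum page_table_level level)
{
	struct pgt_directory_group *dgroup = dgroup_list;

	if (!dgroup || dgroup->count == PGT_DIRECTORY_GROUP_SIZE) {
		dgroup = calloc(1, sizeof(*dgroup));
		if (!dgroup)
			return -1;
		dgroup->next = dgroup_list;
		dgroup_list = dgroup;
	}
	dgroup->directory[dgroup->count].ptr = ptr;
	dgroup->directory[dgroup->count].level = level;
	dgroup->count++;
	return 0;
}

/*
 * Walk the groups and free every recorded directory page, then the
 * group itself. The kernel version instead dispatches on the level to
 * the matching kvm_pte/pmd/pud/p4d_free() helper.
 */
static void pgt_untrack_all(void)
{
	struct pgt_directory_group *dgroup, *next;
	int i;

	for (dgroup = dgroup_list; dgroup; dgroup = next) {
		next = dgroup->next;
		for (i = 0; i < dgroup->count; i++)
			free(dgroup->directory[i].ptr);
		free(dgroup);
	}
	dgroup_list = NULL;
}

int main(void)
{
	pgt_track(malloc(4096), PGT_LEVEL_PMD);
	pgt_track(malloc(4096), PGT_LEVEL_PTE);
	pgt_untrack_all();
	printf("all tracked directories freed\n");
	return 0;
}

Tracking directory pages at allocation time is what lets the error path
in kvm_isolation_init_page_table() unwind a partially built copy with a
single call to kvm_isolation_uninit_page_table(), without re-walking the
KVM page table.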