On Sat, Mar 13, 2021 at 06:01:19PM +0200, Jarkko Sakkinen wrote:
> Background
> ==========
>
> An EPC section is covered by one or more SRAT entries that are associated
> with one and only one PXM (NUMA node). The motivation behind this patch is
> to provide the basic elements for building an allocation scheme based on
> this premise.
>
> Just like normal RAM, enclave memory (EPC) should be covered by entries
> in the ACPI SRAT table. These entries allow each EPC section to be
> associated with a NUMA node.
>
> Use this information to implement a simple NUMA-aware allocator for
> enclave memory.
>
> Solution
> ========
>
> Use phys_to_target_node() to associate each NUMA node with the EPC
> sections contained within its range. In sgx_alloc_epc_page(), first try
> to allocate from the NUMA node where the CPU is executing. If that
> fails, allocate from the other nodes, iterating them from the current
> node in order.
>
> Other
> =====
>
> The NUMA_KEEP_MEMINFO dependency is required for phys_to_target_node().
>
> Link: https://lore.kernel.org/lkml/158188326978.894464.217282995221175417.stgit@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx/
> Signed-off-by: Jarkko Sakkinen <jarkko@xxxxxxxxxx>

We *can* also have epc_page->node by:

- Considering the first section as the EPC of that node.
- Printing a warning if more sections hit the same node, and ignoring them.
- Merging sgx_numa_node and sgx_epc_section.

I think this would be a decent idea. It's a sane assumption that a node has
a single EPC section, but it's good to have that warning just in case.

I did not want to do this in this version, because it's faster for me to
refactor towards this assumption later than to revert back from it.
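Roughly, I mean something like the sketch below. Only an illustration: none
of these names (sgx_node_section, sgx_assign_section_node()) exist in the
patch.

struct sgx_epc_section {
	unsigned long phys_addr;
	void *virt_addr;
	struct sgx_epc_page *pages;

	/* Free pages local to this section, i.e. to its node: */
	struct list_head free_page_list;
	spinlock_t lock;
	int nid;
};

/* The EPC section of each node, or NULL when the node has no EPC: */
static struct sgx_epc_section *sgx_node_section[MAX_NUMNODES];

static bool __init sgx_assign_section_node(struct sgx_epc_section *section, int nid)
{
	/* The first section becomes the EPC of the node: */
	if (!sgx_node_section[nid]) {
		sgx_node_section[nid] = section;
		section->nid = nid;
		return true;
	}

	/* Warn about further sections hitting the same node, and ignore them: */
	pr_warn(FW_BUG "Node %d has multiple EPC sections, ignoring one.\n", nid);
	return false;
}

Then epc_page->node is just sgx_epc_sections[page->section].nid.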

/Jarkko

> ---
>
> v4:
> * Cycle nodes instead of a global page list, starting from the node
>   of the current thread.
> * Documented the NUMA_KEEP_MEMINFO dependency in the commit message.
> * Added a NUMA node pointer to struct sgx_epc_section. An EPC page
>   should reference a section, since a node could potentially have
>   multiple sections (the Intel SDM does not say anything explicit
>   about this). This is the safest play.
> * Removed nodes_clear(sgx_numa_node_mask).
> * Appended Dave's additions to the commit message for the background
>   section.
>
>  arch/x86/Kconfig               |   1 +
>  arch/x86/kernel/cpu/sgx/main.c | 117 ++++++++++++++++++++-------------
>  arch/x86/kernel/cpu/sgx/sgx.h  |  16 +++--
>  3 files changed, 84 insertions(+), 50 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 513895af8ee7..3e6152a8dd2b 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1930,6 +1930,7 @@ config X86_SGX
>  	depends on CRYPTO_SHA256=y
>  	select SRCU
>  	select MMU_NOTIFIER
> +	select NUMA_KEEP_MEMINFO if NUMA
>  	help
>  	  Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
>  	  that can be used by applications to set aside private regions of code
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index cb4561444b96..3b524a1361d6 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -18,14 +18,23 @@ static int sgx_nr_epc_sections;
>  static struct task_struct *ksgxd_tsk;
>  static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
>
> -/*
> - * These variables are part of the state of the reclaimer, and must be accessed
> - * with sgx_reclaimer_lock acquired.
> - */
> +/* The reclaimer lock protected variables prepend the lock. */
>  static LIST_HEAD(sgx_active_page_list);
> -
>  static DEFINE_SPINLOCK(sgx_reclaimer_lock);
>
> +/* The free page list lock protected variables prepend the lock. */
> +static unsigned long sgx_nr_free_pages;
> +
> +/* Nodes with one or more EPC sections. */
> +static nodemask_t sgx_numa_mask;
> +
> +/*
> + * Array with one list_head for each possible NUMA node. Each
> + * list contains all the sgx_epc_section's which are on that
> + * node.
> + */
> +static struct sgx_numa_node *sgx_numa_nodes;
> +
>  /*
>   * When the driver initialized, EPC pages go first here, as they could be
>   * initialized to an active enclave, on kexec entry.
> @@ -352,21 +361,9 @@ static void sgx_reclaim_pages(void)
>  	}
>  }
>
> -static unsigned long sgx_nr_free_pages(void)
> -{
> -	unsigned long cnt = 0;
> -	int i;
> -
> -	for (i = 0; i < sgx_nr_epc_sections; i++)
> -		cnt += sgx_epc_sections[i].free_cnt;
> -
> -	return cnt;
> -}
> -
>  static bool sgx_should_reclaim(unsigned long watermark)
>  {
> -	return sgx_nr_free_pages() < watermark &&
> -	       !list_empty(&sgx_active_page_list);
> +	return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
>  }
>
>  static int ksgxd(void *p)
> @@ -443,50 +440,63 @@ static bool __init sgx_page_reclaimer_init(void)
>  	return true;
>  }
>
> -static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section)
> +static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
>  {
> -	struct sgx_epc_page *page;
> +	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
> +	struct sgx_epc_page *page = NULL;
> +
> +	if (!node_isset(nid, sgx_numa_mask))
> +		return NULL;
>
> -	spin_lock(&section->lock);
> +	spin_lock(&node->lock);
>
> -	if (list_empty(&section->page_list)) {
> -		spin_unlock(&section->lock);
> +	if (list_empty(&node->free_page_list)) {
> +		spin_unlock(&node->lock);
>  		return NULL;
>  	}
>
> -	page = list_first_entry(&section->page_list, struct sgx_epc_page, list);
> +	page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
>  	list_del_init(&page->list);
> -	section->free_cnt--;
> +	sgx_nr_free_pages--;
> +
> +	spin_unlock(&node->lock);
>
> -	spin_unlock(&section->lock);
>  	return page;
>  }
>
>  /**
>   * __sgx_alloc_epc_page() - Allocate an EPC page
>   *
> - * Iterate through EPC sections and borrow a free EPC page to the caller. When a
> - * page is no longer needed it must be released with sgx_free_epc_page().
> + * Iterate through NUMA nodes and borrow a free EPC page to the caller. When a
> + * page is no longer needed it must be released with sgx_free_epc_page(). Start
> + * from the NUMA node, where the caller is executing.
>   *
>   * Return:
> - *   an EPC page,
> - *   -errno on error
> + * - an EPC page:	Free EPC pages were available.
> + * - ERR_PTR(-ENOMEM):	Run out of EPC pages.
>   */
>  struct sgx_epc_page *__sgx_alloc_epc_page(void)
>  {
> -	struct sgx_epc_section *section;
>  	struct sgx_epc_page *page;
> -	int i;
> +	int nid = numa_node_id();
>
> -	for (i = 0; i < sgx_nr_epc_sections; i++) {
> -		section = &sgx_epc_sections[i];
> +	/* Try to allocate EPC from the current node, first: */
> +	page = __sgx_alloc_epc_page_from_node(nid);
> +	if (page)
> +		return page;
>
> -		page = __sgx_alloc_epc_page_from_section(section);
> +	/* Then, go through the other nodes: */
> +	while (true) {
> +		nid = next_node_in(nid, sgx_numa_mask);
> +		if (nid == numa_node_id())
> +			break;
> +
> +		page = __sgx_alloc_epc_page_from_node(nid);
>  		if (page)
> -			return page;
> +			break;
>  	}
>
> -	return ERR_PTR(-ENOMEM);
> +	return page;
>  }
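
Btw, to illustrate the fallback order above: next_node_in() wraps around
sgx_numa_mask, so the loop visits each remaining node once and stops when
it arrives back at the starting node. A userspace toy of just the
traversal, with a plain bitmask standing in for the nodemask (nothing
below is kernel code):

#include <stdio.h>

/* Stand-in for next_node_in(): next set bit after 'node' in an 8-bit
 * node mask, wrapping around at the end. */
static int next_node_in(int node, unsigned int mask)
{
	int i;

	for (i = 1; i <= 8; i++) {
		int nid = (node + i) % 8;

		if (mask & (1u << nid))
			return nid;
	}

	return node;
}

int main(void)
{
	unsigned int sgx_numa_mask = 0x0b;	/* EPC on nodes 0, 1 and 3 */
	int start = 1;				/* pretend numa_node_id() */
	int nid = start;

	/* Same shape as the loop in __sgx_alloc_epc_page(): */
	while ((nid = next_node_in(nid, sgx_numa_mask)) != start)
		printf("falling back to node %d\n", nid);	/* 3, then 0 */

	return 0;
}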
>
>  /**
> @@ -592,6 +602,7 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
>  void sgx_free_epc_page(struct sgx_epc_page *page)
>  {
>  	struct sgx_epc_section *section = &sgx_epc_sections[page->section];
> +	struct sgx_numa_node *node = section->node;
>  	int ret;
>
>  	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
> @@ -600,10 +611,12 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
>  	if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
>  		return;
>
> -	spin_lock(&section->lock);
> -	list_add_tail(&page->list, &section->page_list);
> -	section->free_cnt++;
> -	spin_unlock(&section->lock);
> +	spin_lock(&node->lock);
> +
> +	list_add_tail(&page->list, &node->free_page_list);
> +	sgx_nr_free_pages++;
> +
> +	spin_unlock(&node->lock);
>  }
>
>  static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
> @@ -624,8 +637,6 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
>  	}
>
>  	section->phys_addr = phys_addr;
> -	spin_lock_init(&section->lock);
> -	INIT_LIST_HEAD(&section->page_list);
>
>  	for (i = 0; i < nr_pages; i++) {
>  		section->pages[i].section = index;
> @@ -634,7 +645,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
>  		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
>  	}
>
> -	section->free_cnt = nr_pages;
> +	sgx_nr_free_pages += nr_pages;
>  	return true;
>  }
>
> @@ -653,8 +664,11 @@ static bool __init sgx_page_cache_init(void)
>  {
>  	u32 eax, ebx, ecx, edx, type;
>  	u64 pa, size;
> +	int nid;
>  	int i;
>
> +	sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
> +
>  	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
>  		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
>
> @@ -677,6 +691,21 @@ static bool __init sgx_page_cache_init(void)
>  			break;
>  		}
>
> +		nid = numa_map_to_online_node(phys_to_target_node(pa));
> +		if (nid == NUMA_NO_NODE) {
> +			/* The physical address is already printed above. */
> +			pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
> +			nid = 0;
> +		}
> +
> +		if (!node_isset(nid, sgx_numa_mask)) {
> +			spin_lock_init(&sgx_numa_nodes[nid].lock);
> +			INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
> +			node_set(nid, sgx_numa_mask);
> +		}
> +
> +		sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
> +
>  		sgx_nr_epc_sections++;
>  	}
>
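One nit against my own patch while re-reading: the kmalloc_array() above is
not checked for failure. I think sgx_page_cache_init() needs something like
this (untested):

	sgx_numa_nodes = kmalloc_array(num_possible_nodes(),
				       sizeof(*sgx_numa_nodes), GFP_KERNEL);
	if (!sgx_numa_nodes)
		return false;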
> diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
> index bc8af0428640..653af8ca1a25 100644
> --- a/arch/x86/kernel/cpu/sgx/sgx.h
> +++ b/arch/x86/kernel/cpu/sgx/sgx.h
> @@ -29,22 +29,26 @@ struct sgx_epc_page {
>  	struct list_head list;
>  };
>
> +/*
> + * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
> + * the free page list local to the node is stored here.
> + */
> +struct sgx_numa_node {
> +	struct list_head free_page_list;
> +	spinlock_t lock;
> +};
> +
>  /*
>   * The firmware can define multiple chunks of EPC to the different areas of the
>   * physical memory e.g. for memory areas of the each node. This structure is
>   * used to store EPC pages for one EPC section and virtual memory area where
>   * the pages have been mapped.
> - *
> - * 'lock' must be held before accessing 'page_list' or 'free_cnt'.
>   */
>  struct sgx_epc_section {
>  	unsigned long phys_addr;
>  	void *virt_addr;
>  	struct sgx_epc_page *pages;
> -
> -	spinlock_t lock;
> -	struct list_head page_list;
> -	unsigned long free_cnt;
> +	struct sgx_numa_node *node;
>  };
>
>  extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
> --
> 2.30.2
>
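PS. With the node pointer in sgx_epc_section, resolving the node of an
arbitrary EPC page would be a one-liner, e.g. (illustrative, not part of
the patch):

static struct sgx_numa_node *sgx_epc_page_node(struct sgx_epc_page *page)
{
	return sgx_epc_sections[page->section].node;
}

This is exactly the lookup that sgx_free_epc_page() does inline.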