On Fri, 2022-06-24 at 13:13 -0700, Dave Hansen wrote:
> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index 4988a91d5283..ec496e96d120 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -1973,6 +1973,7 @@ config INTEL_TDX_HOST
> >  	depends on CPU_SUP_INTEL
> >  	depends on X86_64
> >  	depends on KVM_INTEL
> > +	depends on CONTIG_ALLOC
> >  	select ARCH_HAS_CC_PLATFORM
> >  	select ARCH_KEEP_MEMBLOCK
> >  	help
> > diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> > index fd9f449b5395..36260dd7e69f 100644
> > --- a/arch/x86/virt/vmx/tdx/tdx.c
> > +++ b/arch/x86/virt/vmx/tdx/tdx.c
> > @@ -558,6 +558,196 @@ static int create_tdmrs(struct tdmr_info *tdmr_array, int *tdmr_num)
> >  	return 0;
> >  }
> >
> > +/* Page sizes supported by TDX */
> > +enum tdx_page_sz {
> > +	TDX_PG_4K,
> > +	TDX_PG_2M,
> > +	TDX_PG_1G,
> > +	TDX_PG_MAX,
> > +};
>
> Are these the same constants as the magic numbers in Kirill's
> try_accept_one()?

try_accept_one() uses 'enum pg_level' PG_LEVEL_{4K,2M,1G} directly.
They could be used here too, but 'enum pg_level' has more than we need:

	enum pg_level {
		PG_LEVEL_NONE,
		PG_LEVEL_4K,
		PG_LEVEL_2M,
		PG_LEVEL_1G,
		PG_LEVEL_512G,
		PG_LEVEL_NUM
	};

It has PG_LEVEL_NONE, so PG_LEVEL_4K starts at 1.

Below in tdmr_set_up_pamt(), I have two local arrays to store the
base/size for all TDX supported page sizes:

	unsigned long pamt_base[TDX_PG_MAX];
	unsigned long pamt_size[TDX_PG_MAX];

And a loop to calculate the size of the PAMT for each page size:

	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
		...
	}

And later a similar loop to get the PAMT base for each page size too.

I can change them to:

	/*
	 * TDX only supports 4K, 2M and 1G pages, but doesn't
	 * support the 512G page size.
	 */
	#define TDX_PG_LEVEL_MAX	PG_LEVEL_512G

	unsigned long pamt_base[TDX_PG_LEVEL_MAX];
	unsigned long pamt_size[TDX_PG_LEVEL_MAX];

And change the loop to:

	for (pgsz = PG_LEVEL_4K; pgsz < TDX_PG_LEVEL_MAX; pgsz++) {
		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
		...
	}

This would waste one 'unsigned long' in each of the pamt_base and
pamt_size arrays, as entry 0 is never used in either of them.

Or we explicitly subtract 1 from the array index:

	for (pgsz = PG_LEVEL_4K; pgsz < TDX_PG_LEVEL_MAX; pgsz++) {
		pamt_size[pgsz - 1] = tdmr_get_pamt_sz(tdmr, pgsz);
		...
	}

What's your opinion?
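(For reference, here is roughly what the '-1' variant could look like
with a small helper to confine the off-by-one to one place.  Untested
sketch only; tdx_pg_idx() and TDX_PG_LEVEL_NUM are made-up names for
illustration, not existing code:

	/* TDX supports 4K/2M/1G but not 512G pages */
	#define TDX_PG_LEVEL_MAX	PG_LEVEL_512G
	/* Number of TDX-supported page sizes: 3 */
	#define TDX_PG_LEVEL_NUM	(TDX_PG_LEVEL_MAX - PG_LEVEL_4K)

	/* Map a pg_level to a zero-based index into the PAMT arrays */
	static inline int tdx_pg_idx(enum pg_level pgsz)
	{
		return pgsz - PG_LEVEL_4K;
	}

	unsigned long pamt_base[TDX_PG_LEVEL_NUM];
	unsigned long pamt_size[TDX_PG_LEVEL_NUM];

	for (pgsz = PG_LEVEL_4K; pgsz < TDX_PG_LEVEL_MAX; pgsz++) {
		pamt_size[tdx_pg_idx(pgsz)] = tdmr_get_pamt_sz(tdmr, pgsz);
		...
	}

That keeps the arrays at exactly TDX_PG_LEVEL_NUM entries and the
callers never index them with a raw 'pgsz - 1'.)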
> > + */ > > +static int tdmr_get_nid(struct tdmr_info *tdmr) > > +{ > > + unsigned long start_pfn, end_pfn; > > + int i, nid; > > + > > + /* Find the first memory region covered by the TDMR */ > > + memblock_for_each_tdx_mem_pfn_range(i, &start_pfn, &end_pfn, &nid) > > { > > + if (end_pfn > (tdmr_start(tdmr) >> PAGE_SHIFT)) > > + return nid; > > + } > > + > > + /* > > + * No memory region found for this TDMR. It cannot happen since > > + * when one TDMR is created, it must cover at least one (or > > + * partial) memory region. > > + */ > > + WARN_ON_ONCE(1); > > + return 0; > > +} > > You should really describe what you are doing. At first glance "return > 0;" looks like "declare success". How about something like this? > > /* > * Fall back to allocating the TDMR from node 0 when no memblock > * can be found. This should never happen since TDMRs originate > * from the memblocks. > */ > > Does that miss any of the points you were trying to make? No. Your comments looks better and will use yours. Thanks. > > > +static int tdmr_set_up_pamt(struct tdmr_info *tdmr) > > +{ > > + unsigned long pamt_base[TDX_PG_MAX]; > > + unsigned long pamt_size[TDX_PG_MAX]; > > + unsigned long tdmr_pamt_base; > > + unsigned long tdmr_pamt_size; > > + enum tdx_page_sz pgsz; > > + struct page *pamt; > > + int nid; > > + > > + nid = tdmr_get_nid(tdmr); > > + > > + /* > > + * Calculate the PAMT size for each TDX supported page size > > + * and the total PAMT size. > > + */ > > + tdmr_pamt_size = 0; > > + for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) { > > + pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz); > > + tdmr_pamt_size += pamt_size[pgsz]; > > + } > > + > > + /* > > + * Allocate one chunk of physically contiguous memory for all > > + * PAMTs. This helps minimize the PAMT's use of reserved areas > > + * in overlapped TDMRs. > > + */ > > + pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL, > > + nid, &node_online_map); > > + if (!pamt) > > + return -ENOMEM; > > I'm not sure it's worth mentioning, but this doesn't really need to be > GFP_KERNEL. __GFP_HIGHMEM would actually be just fine. But, > considering that this is 64-bit only, that's just a technicality. > > > + /* Calculate PAMT base and size for all supported page sizes. */ > > That comment isn't doing much good. If you say anything here it should be: > > /* > * Break the contiguous allocation back up into > * the individual PAMTs for each page size: > */ > > Also, this is *not* "calculating size". That's done above. Thanks will use this comment. > > > + tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT; > > + for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) { > > + pamt_base[pgsz] = tdmr_pamt_base; > > + tdmr_pamt_base += pamt_size[pgsz]; > > + } > > + > > + tdmr->pamt_4k_base = pamt_base[TDX_PG_4K]; > > + tdmr->pamt_4k_size = pamt_size[TDX_PG_4K]; > > + tdmr->pamt_2m_base = pamt_base[TDX_PG_2M]; > > + tdmr->pamt_2m_size = pamt_size[TDX_PG_2M]; > > + tdmr->pamt_1g_base = pamt_base[TDX_PG_1G]; > > + tdmr->pamt_1g_size = pamt_size[TDX_PG_1G]; > > + > > + return 0; > > +} > > > > +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn, > > + unsigned long *pamt_npages) > > +{ > > + unsigned long pamt_base, pamt_sz; > > + > > + /* > > + * The PAMT was allocated in one contiguous unit. The 4K PAMT > > + * should always point to the beginning of that allocation. 
> > +
> > +/*
> > + * Pick a NUMA node on which to allocate this TDMR's metadata.
> > + *
> > + * This is imprecise since TDMRs are 1G aligned and NUMA nodes might
> > + * not be. If the TDMR covers more than one node, just use the _first_
> > + * one. This can lead to small areas of off-node metadata for some
> > + * memory.
> > + */
> > +static int tdmr_get_nid(struct tdmr_info *tdmr)
> > +{
> > +	unsigned long start_pfn, end_pfn;
> > +	int i, nid;
> > +
> > +	/* Find the first memory region covered by the TDMR */
> > +	memblock_for_each_tdx_mem_pfn_range(i, &start_pfn, &end_pfn, &nid) {
> > +		if (end_pfn > (tdmr_start(tdmr) >> PAGE_SHIFT))
> > +			return nid;
> > +	}
> > +
> > +	/*
> > +	 * No memory region found for this TDMR. It cannot happen since
> > +	 * when one TDMR is created, it must cover at least one (or
> > +	 * partial) memory region.
> > +	 */
> > +	WARN_ON_ONCE(1);
> > +	return 0;
> > +}
>
> You should really describe what you are doing. At first glance "return
> 0;" looks like "declare success". How about something like this?
>
> 	/*
> 	 * Fall back to allocating the TDMR from node 0 when no memblock
> 	 * can be found. This should never happen since TDMRs originate
> 	 * from the memblocks.
> 	 */
>
> Does that miss any of the points you were trying to make?

No. Your comment looks better and I will use it. Thanks.

> > +static int tdmr_set_up_pamt(struct tdmr_info *tdmr)
> > +{
> > +	unsigned long pamt_base[TDX_PG_MAX];
> > +	unsigned long pamt_size[TDX_PG_MAX];
> > +	unsigned long tdmr_pamt_base;
> > +	unsigned long tdmr_pamt_size;
> > +	enum tdx_page_sz pgsz;
> > +	struct page *pamt;
> > +	int nid;
> > +
> > +	nid = tdmr_get_nid(tdmr);
> > +
> > +	/*
> > +	 * Calculate the PAMT size for each TDX supported page size
> > +	 * and the total PAMT size.
> > +	 */
> > +	tdmr_pamt_size = 0;
> > +	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> > +		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
> > +		tdmr_pamt_size += pamt_size[pgsz];
> > +	}
> > +
> > +	/*
> > +	 * Allocate one chunk of physically contiguous memory for all
> > +	 * PAMTs. This helps minimize the PAMT's use of reserved areas
> > +	 * in overlapped TDMRs.
> > +	 */
> > +	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
> > +			nid, &node_online_map);
> > +	if (!pamt)
> > +		return -ENOMEM;
>
> I'm not sure it's worth mentioning, but this doesn't really need to be
> GFP_KERNEL. __GFP_HIGHMEM would actually be just fine. But,
> considering that this is 64-bit only, that's just a technicality.
>
> > +	/* Calculate PAMT base and size for all supported page sizes. */
>
> That comment isn't doing much good. If you say anything here it should be:
>
> 	/*
> 	 * Break the contiguous allocation back up into
> 	 * the individual PAMTs for each page size:
> 	 */
>
> Also, this is *not* "calculating size". That's done above.

Thanks, I will use this comment.

> > +	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
> > +	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> > +		pamt_base[pgsz] = tdmr_pamt_base;
> > +		tdmr_pamt_base += pamt_size[pgsz];
> > +	}
> > +
> > +	tdmr->pamt_4k_base = pamt_base[TDX_PG_4K];
> > +	tdmr->pamt_4k_size = pamt_size[TDX_PG_4K];
> > +	tdmr->pamt_2m_base = pamt_base[TDX_PG_2M];
> > +	tdmr->pamt_2m_size = pamt_size[TDX_PG_2M];
> > +	tdmr->pamt_1g_base = pamt_base[TDX_PG_1G];
> > +	tdmr->pamt_1g_size = pamt_size[TDX_PG_1G];
> > +
> > +	return 0;
> > +}
> >
> > +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
> > +			  unsigned long *pamt_npages)
> > +{
> > +	unsigned long pamt_base, pamt_sz;
> > +
> > +	/*
> > +	 * The PAMT was allocated in one contiguous unit. The 4K PAMT
> > +	 * should always point to the beginning of that allocation.
> > +	 */
> > +	pamt_base = tdmr->pamt_4k_base;
> > +	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size +
> > +		  tdmr->pamt_1g_size;
> > +
> > +	*pamt_pfn = pamt_base >> PAGE_SHIFT;
> > +	*pamt_npages = pamt_sz >> PAGE_SHIFT;
> > +}
> > +
> > +static void tdmr_free_pamt(struct tdmr_info *tdmr)
> > +{
> > +	unsigned long pamt_pfn, pamt_npages;
> > +
> > +	tdmr_get_pamt(tdmr, &pamt_pfn, &pamt_npages);
> > +
> > +	/* Do nothing if PAMT hasn't been allocated for this TDMR */
> > +	if (!pamt_npages)
> > +		return;
> > +
> > +	if (WARN_ON_ONCE(!pamt_pfn))
> > +		return;
> > +
> > +	free_contig_range(pamt_pfn, pamt_npages);
> > +}
> > +
> > +static void tdmrs_free_pamt_all(struct tdmr_info *tdmr_array, int tdmr_num)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < tdmr_num; i++)
> > +		tdmr_free_pamt(tdmr_array_entry(tdmr_array, i));
> > +}
> > +
> > +/* Allocate and set up PAMTs for all TDMRs */
> > +static int tdmrs_set_up_pamt_all(struct tdmr_info *tdmr_array, int tdmr_num)
> > +{
> > +	int i, ret = 0;
> > +
> > +	for (i = 0; i < tdmr_num; i++) {
> > +		ret = tdmr_set_up_pamt(tdmr_array_entry(tdmr_array, i));
> > +		if (ret)
> > +			goto err;
> > +	}
> > +
> > +	return 0;
> > +err:
> > +	tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> > +	return ret;
> > +}
> > +
> > +static unsigned long tdmrs_get_pamt_pages(struct tdmr_info *tdmr_array,
> > +					  int tdmr_num)
>
> "get" is for refcounting. tdmrs_count_pamt_pages() would be preferable.

Will use "count". Thanks.

> > +{
> > +	unsigned long pamt_npages = 0;
> > +	int i;
> > +
> > +	for (i = 0; i < tdmr_num; i++) {
> > +		unsigned long pfn, npages;
> > +
> > +		tdmr_get_pamt(tdmr_array_entry(tdmr_array, i), &pfn, &npages);
> > +		pamt_npages += npages;
> > +	}
> > +
> > +	return pamt_npages;
> > +}
> > +
> >  /*
> >   * Construct an array of TDMRs to cover all memory regions in memblock.
> >   * This makes sure all pages managed by the page allocator are TDX
> > @@ -572,8 +762,13 @@ static int construct_tdmrs_memeblock(struct tdmr_info *tdmr_array,
> >  	if (ret)
> >  		goto err;
> >
> > +	ret = tdmrs_set_up_pamt_all(tdmr_array, *tdmr_num);
> > +	if (ret)
> > +		goto err;
> > +
> >  	/* Return -EINVAL until constructing TDMRs is done */
> >  	ret = -EINVAL;
> > +	tdmrs_free_pamt_all(tdmr_array, *tdmr_num);
> >  err:
> >  	return ret;
> >  }
> > @@ -644,6 +839,11 @@ static int init_tdx_module(void)
> >  	 * process are done.
> >  	 */
> >  	ret = -EINVAL;
> > +	if (ret)
> > +		tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> > +	else
> > +		pr_info("%lu pages allocated for PAMT.\n",
> > +				tdmrs_get_pamt_pages(tdmr_array, tdmr_num));
> >  out_free_tdmrs:
> >  	/*
> >  	 * The array of TDMRs is freed no matter the initialization is
>
> The rest looks OK.

Thanks.

-- 
Thanks,
-Kai