On Fri, 2022-06-24 at 13:13 -0700, Dave Hansen wrote:
> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index 4988a91d5283..ec496e96d120 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -1973,6 +1973,7 @@ config INTEL_TDX_HOST
> >  	depends on CPU_SUP_INTEL
> >  	depends on X86_64
> >  	depends on KVM_INTEL
> > +	depends on CONTIG_ALLOC
> >  	select ARCH_HAS_CC_PLATFORM
> >  	select ARCH_KEEP_MEMBLOCK
> >  	help
> > diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> > index fd9f449b5395..36260dd7e69f 100644
> > --- a/arch/x86/virt/vmx/tdx/tdx.c
> > +++ b/arch/x86/virt/vmx/tdx/tdx.c
> > @@ -558,6 +558,196 @@ static int create_tdmrs(struct tdmr_info *tdmr_array, int *tdmr_num)
> >  	return 0;
> >  }
> >
> > +/* Page sizes supported by TDX */
> > +enum tdx_page_sz {
> > +	TDX_PG_4K,
> > +	TDX_PG_2M,
> > +	TDX_PG_1G,
> > +	TDX_PG_MAX,
> > +};
>
> Are these the same constants as the magic numbers in Kirill's
> try_accept_one()?

try_accept_one() uses 'enum pg_level' PG_LEVEL_{4K,2M,1G} directly.
They could be used here too, but 'enum pg_level' has more than we need:

	enum pg_level {
		PG_LEVEL_NONE,
		PG_LEVEL_4K,
		PG_LEVEL_2M,
		PG_LEVEL_1G,
		PG_LEVEL_512G,
		PG_LEVEL_NUM
	};

It has PG_LEVEL_NONE, so PG_LEVEL_4K starts at 1.

Below in tdmr_set_up_pamt(), I have two local arrays to store the
base/size for all TDX supported page sizes:

	unsigned long pamt_base[TDX_PG_MAX];
	unsigned long pamt_size[TDX_PG_MAX];

And a loop to calculate the size of the PAMT for each page size:

	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
		...
	}

And later a similar loop to get the PAMT base for each page size too.

I can change them to:

	/*
	 * TDX only supports 4K, 2M and 1G pages, but doesn't
	 * support the 512G page size.
	 */
	#define TDX_PG_LEVEL_MAX	PG_LEVEL_512G

	unsigned long pamt_base[TDX_PG_LEVEL_MAX];
	unsigned long pamt_size[TDX_PG_LEVEL_MAX];

And change the loop to:

	for (pgsz = PG_LEVEL_4K; pgsz < TDX_PG_LEVEL_MAX; pgsz++) {
		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
		...
	}

This would waste one 'unsigned long' in each of the pamt_base and
pamt_size arrays, as entry 0 is never used in either of them.

Or we explicitly subtract 1 from the array index:

	for (pgsz = PG_LEVEL_4K; pgsz < TDX_PG_LEVEL_MAX; pgsz++) {
		pamt_size[pgsz - 1] = tdmr_get_pamt_sz(tdmr, pgsz);
		...
	}

What's your opinion?
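(For reference, here is roughly what the '-1' variant could look like
with a small helper to confine the off-by-one to one place.  Untested
sketch only; tdx_pg_idx() and TDX_PG_LEVEL_NUM are made-up names for
illustration, not existing code:

	/* TDX supports 4K/2M/1G but not 512G pages */
	#define TDX_PG_LEVEL_MAX	PG_LEVEL_512G
	/* Number of TDX-supported page sizes: 3 */
	#define TDX_PG_LEVEL_NUM	(TDX_PG_LEVEL_MAX - PG_LEVEL_4K)

	/* Map a pg_level to a zero-based index into the PAMT arrays */
	static inline int tdx_pg_idx(enum pg_level pgsz)
	{
		return pgsz - PG_LEVEL_4K;
	}

	unsigned long pamt_base[TDX_PG_LEVEL_NUM];
	unsigned long pamt_size[TDX_PG_LEVEL_NUM];

	for (pgsz = PG_LEVEL_4K; pgsz < TDX_PG_LEVEL_MAX; pgsz++) {
		pamt_size[tdx_pg_idx(pgsz)] = tdmr_get_pamt_sz(tdmr, pgsz);
		...
	}

That keeps the arrays at exactly TDX_PG_LEVEL_NUM entries and the
callers never index them with a raw 'pgsz - 1'.)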
> > + */ > > +static int tdmr_get_nid(struct tdmr_info *tdmr) > > +{ > > + unsigned long start_pfn, end_pfn; > > + int i, nid; > > + > > + /* Find the first memory region covered by the TDMR */ > > + memblock_for_each_tdx_mem_pfn_range(i, &start_pfn, &end_pfn, &nid) > > { > > + if (end_pfn > (tdmr_start(tdmr) >> PAGE_SHIFT)) > > + return nid; > > + } > > + > > + /* > > + * No memory region found for this TDMR. It cannot happen since > > + * when one TDMR is created, it must cover at least one (or > > + * partial) memory region. > > + */ > > + WARN_ON_ONCE(1); > > + return 0; > > +} > > You should really describe what you are doing. At first glance "return > 0;" looks like "declare success". How about something like this? > > /* > * Fall back to allocating the TDMR from node 0 when no memblock > * can be found. This should never happen since TDMRs originate > * from the memblocks. > */ > > Does that miss any of the points you were trying to make? No. Your comments looks better and will use yours. Thanks. > > > +static int tdmr_set_up_pamt(struct tdmr_info *tdmr) > > +{ > > + unsigned long pamt_base[TDX_PG_MAX]; > > + unsigned long pamt_size[TDX_PG_MAX]; > > + unsigned long tdmr_pamt_base; > > + unsigned long tdmr_pamt_size; > > + enum tdx_page_sz pgsz; > > + struct page *pamt; > > + int nid; > > + > > + nid = tdmr_get_nid(tdmr); > > + > > + /* > > + * Calculate the PAMT size for each TDX supported page size > > + * and the total PAMT size. > > + */ > > + tdmr_pamt_size = 0; > > + for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) { > > + pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz); > > + tdmr_pamt_size += pamt_size[pgsz]; > > + } > > + > > + /* > > + * Allocate one chunk of physically contiguous memory for all > > + * PAMTs. This helps minimize the PAMT's use of reserved areas > > + * in overlapped TDMRs. > > + */ > > + pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL, > > + nid, &node_online_map); > > + if (!pamt) > > + return -ENOMEM; > > I'm not sure it's worth mentioning, but this doesn't really need to be > GFP_KERNEL. __GFP_HIGHMEM would actually be just fine. But, > considering that this is 64-bit only, that's just a technicality. > > > + /* Calculate PAMT base and size for all supported page sizes. */ > > That comment isn't doing much good. If you say anything here it should be: > > /* > * Break the contiguous allocation back up into > * the individual PAMTs for each page size: > */ > > Also, this is *not* "calculating size". That's done above. Thanks will use this comment. > > > + tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT; > > + for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) { > > + pamt_base[pgsz] = tdmr_pamt_base; > > + tdmr_pamt_base += pamt_size[pgsz]; > > + } > > + > > + tdmr->pamt_4k_base = pamt_base[TDX_PG_4K]; > > + tdmr->pamt_4k_size = pamt_size[TDX_PG_4K]; > > + tdmr->pamt_2m_base = pamt_base[TDX_PG_2M]; > > + tdmr->pamt_2m_size = pamt_size[TDX_PG_2M]; > > + tdmr->pamt_1g_base = pamt_base[TDX_PG_1G]; > > + tdmr->pamt_1g_size = pamt_size[TDX_PG_1G]; > > + > > + return 0; > > +} > > > > +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn, > > + unsigned long *pamt_npages) > > +{ > > + unsigned long pamt_base, pamt_sz; > > + > > + /* > > + * The PAMT was allocated in one contiguous unit. The 4K PAMT > > + * should always point to the beginning of that allocation. 
> > +
> > +/*
> > + * Pick a NUMA node on which to allocate this TDMR's metadata.
> > + *
> > + * This is imprecise since TDMRs are 1G aligned and NUMA nodes might
> > + * not be. If the TDMR covers more than one node, just use the _first_
> > + * one. This can lead to small areas of off-node metadata for some
> > + * memory.
> > + */
> > +static int tdmr_get_nid(struct tdmr_info *tdmr)
> > +{
> > +	unsigned long start_pfn, end_pfn;
> > +	int i, nid;
> > +
> > +	/* Find the first memory region covered by the TDMR */
> > +	memblock_for_each_tdx_mem_pfn_range(i, &start_pfn, &end_pfn, &nid) {
> > +		if (end_pfn > (tdmr_start(tdmr) >> PAGE_SHIFT))
> > +			return nid;
> > +	}
> > +
> > +	/*
> > +	 * No memory region found for this TDMR. It cannot happen since
> > +	 * when one TDMR is created, it must cover at least one (or
> > +	 * partial) memory region.
> > +	 */
> > +	WARN_ON_ONCE(1);
> > +	return 0;
> > +}
>
> You should really describe what you are doing. At first glance "return
> 0;" looks like "declare success". How about something like this?
>
> 	/*
> 	 * Fall back to allocating the TDMR from node 0 when no memblock
> 	 * can be found. This should never happen since TDMRs originate
> 	 * from the memblocks.
> 	 */
>
> Does that miss any of the points you were trying to make?

No. Your comment looks better and I will use it. Thanks.

> > +static int tdmr_set_up_pamt(struct tdmr_info *tdmr)
> > +{
> > +	unsigned long pamt_base[TDX_PG_MAX];
> > +	unsigned long pamt_size[TDX_PG_MAX];
> > +	unsigned long tdmr_pamt_base;
> > +	unsigned long tdmr_pamt_size;
> > +	enum tdx_page_sz pgsz;
> > +	struct page *pamt;
> > +	int nid;
> > +
> > +	nid = tdmr_get_nid(tdmr);
> > +
> > +	/*
> > +	 * Calculate the PAMT size for each TDX supported page size
> > +	 * and the total PAMT size.
> > +	 */
> > +	tdmr_pamt_size = 0;
> > +	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> > +		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
> > +		tdmr_pamt_size += pamt_size[pgsz];
> > +	}
> > +
> > +	/*
> > +	 * Allocate one chunk of physically contiguous memory for all
> > +	 * PAMTs. This helps minimize the PAMT's use of reserved areas
> > +	 * in overlapped TDMRs.
> > +	 */
> > +	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
> > +			nid, &node_online_map);
> > +	if (!pamt)
> > +		return -ENOMEM;
>
> I'm not sure it's worth mentioning, but this doesn't really need to be
> GFP_KERNEL. __GFP_HIGHMEM would actually be just fine. But,
> considering that this is 64-bit only, that's just a technicality.
>
> > +	/* Calculate PAMT base and size for all supported page sizes. */
>
> That comment isn't doing much good. If you say anything here it should be:
>
> 	/*
> 	 * Break the contiguous allocation back up into
> 	 * the individual PAMTs for each page size:
> 	 */
>
> Also, this is *not* "calculating size". That's done above.

Thanks, I will use this comment.

> > +	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
> > +	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> > +		pamt_base[pgsz] = tdmr_pamt_base;
> > +		tdmr_pamt_base += pamt_size[pgsz];
> > +	}
> > +
> > +	tdmr->pamt_4k_base = pamt_base[TDX_PG_4K];
> > +	tdmr->pamt_4k_size = pamt_size[TDX_PG_4K];
> > +	tdmr->pamt_2m_base = pamt_base[TDX_PG_2M];
> > +	tdmr->pamt_2m_size = pamt_size[TDX_PG_2M];
> > +	tdmr->pamt_1g_base = pamt_base[TDX_PG_1G];
> > +	tdmr->pamt_1g_size = pamt_size[TDX_PG_1G];
> > +
> > +	return 0;
> > +}
> >
> > +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
> > +			  unsigned long *pamt_npages)
> > +{
> > +	unsigned long pamt_base, pamt_sz;
> > +
> > +	/*
> > +	 * The PAMT was allocated in one contiguous unit. The 4K PAMT
> > +	 * should always point to the beginning of that allocation.
> > +	 */
> > +	pamt_base = tdmr->pamt_4k_base;
> > +	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size +
> > +		  tdmr->pamt_1g_size;
> > +
> > +	*pamt_pfn = pamt_base >> PAGE_SHIFT;
> > +	*pamt_npages = pamt_sz >> PAGE_SHIFT;
> > +}
> > +
> > +static void tdmr_free_pamt(struct tdmr_info *tdmr)
> > +{
> > +	unsigned long pamt_pfn, pamt_npages;
> > +
> > +	tdmr_get_pamt(tdmr, &pamt_pfn, &pamt_npages);
> > +
> > +	/* Do nothing if PAMT hasn't been allocated for this TDMR */
> > +	if (!pamt_npages)
> > +		return;
> > +
> > +	if (WARN_ON_ONCE(!pamt_pfn))
> > +		return;
> > +
> > +	free_contig_range(pamt_pfn, pamt_npages);
> > +}
> > +
> > +static void tdmrs_free_pamt_all(struct tdmr_info *tdmr_array, int tdmr_num)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < tdmr_num; i++)
> > +		tdmr_free_pamt(tdmr_array_entry(tdmr_array, i));
> > +}
> > +
> > +/* Allocate and set up PAMTs for all TDMRs */
> > +static int tdmrs_set_up_pamt_all(struct tdmr_info *tdmr_array, int tdmr_num)
> > +{
> > +	int i, ret = 0;
> > +
> > +	for (i = 0; i < tdmr_num; i++) {
> > +		ret = tdmr_set_up_pamt(tdmr_array_entry(tdmr_array, i));
> > +		if (ret)
> > +			goto err;
> > +	}
> > +
> > +	return 0;
> > +err:
> > +	tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> > +	return ret;
> > +}
> > +
> > +static unsigned long tdmrs_get_pamt_pages(struct tdmr_info *tdmr_array,
> > +					  int tdmr_num)
>
> "get" is for refcounting. tdmrs_count_pamt_pages() would be preferable.

Will use "count". Thanks.

> > +{
> > +	unsigned long pamt_npages = 0;
> > +	int i;
> > +
> > +	for (i = 0; i < tdmr_num; i++) {
> > +		unsigned long pfn, npages;
> > +
> > +		tdmr_get_pamt(tdmr_array_entry(tdmr_array, i), &pfn, &npages);
> > +		pamt_npages += npages;
> > +	}
> > +
> > +	return pamt_npages;
> > +}
> > +
> >  /*
> >   * Construct an array of TDMRs to cover all memory regions in memblock.
> >   * This makes sure all pages managed by the page allocator are TDX
> > @@ -572,8 +762,13 @@ static int construct_tdmrs_memeblock(struct tdmr_info *tdmr_array,
> >  	if (ret)
> >  		goto err;
> >
> > +	ret = tdmrs_set_up_pamt_all(tdmr_array, *tdmr_num);
> > +	if (ret)
> > +		goto err;
> > +
> >  	/* Return -EINVAL until constructing TDMRs is done */
> >  	ret = -EINVAL;
> > +	tdmrs_free_pamt_all(tdmr_array, *tdmr_num);
> >  err:
> >  	return ret;
> >  }
> > @@ -644,6 +839,11 @@ static int init_tdx_module(void)
> >  	 * process are done.
> >  	 */
> >  	ret = -EINVAL;
> > +	if (ret)
> > +		tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> > +	else
> > +		pr_info("%lu pages allocated for PAMT.\n",
> > +				tdmrs_get_pamt_pages(tdmr_array, tdmr_num));
> >  out_free_tdmrs:
> >  	/*
> >  	 * The array of TDMRs is freed no matter the initialization is
>
> The rest looks OK.

Thanks.

-- 
Thanks,
-Kai