Re: [PATCH v4 05/14] kexec: Add Kexec HandOver (KHO) generation helpers

On Tue, Feb 11, 2025 at 12:37:20PM -0400, Jason Gunthorpe wrote:

> To do that you need to preserve folios as the basic primitive.

I made a small sketch of what I suggest.

I imagine the FDT schema for this would look something like this:

/dts-v1/;
/ {
  compatible = "linux-kho,v1";
  phys-addr-size = <64>;
  void-p-size = <64>;
  preserved-folio-map = <phys_addr>;

  // The per "driver" storage
  instance@1 {..};
  instance@2 {..};
};

I think this is a lot better than what is in this series. It uses much
less memory when there are a lot of allocations, it supports folios of
any order, it is efficient for 1G guestmemfd folios, and it only needs a
few bytes in the FDT. It could also preserve and restore the high order
folio struct page folding (HVO).

The use cases I'm imagining for drivers would be pushing gigabytes of
memory into this preservation mechanism. It needs to be scalable!

This also illustrates my point that I don't think FDT is a good
representation to use exclusively. This in-memory structure is much
better and faster than trying to represent the same information
embedded directly into the FDT. I imagine this to be the general
pattern that drivers will want to use. A few bytes in the FDT pointing
at a scalable in-memory structure for the bulk of the data.
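
As a concrete sketch of that pattern (entirely hypothetical names, using
the libfdt sequential-write API), a driver's contribution to the KHO FDT
could be as small as:

/*
 * Hypothetical per-driver glue: the FDT node carries nothing but a
 * physical pointer to the driver's scalable in-memory state.
 */
static int driver_write_kho_fdt(void *fdt, phys_addr_t state_head)
{
	int err;

	err = fdt_begin_node(fdt, "instance@1");
	if (err)
		return err;
	err = fdt_property_u64(fdt, "state-head", state_head);
	if (err)
		return err;
	return fdt_end_node(fdt);
}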

/*
 * Keep track of folio memory that is to be preserved across KHO.
 *
 * This is designed with the idea that the system will have a lot of memory,
 * e.g. 1TB, and the majority of it will be ~1G folios assigned to hugetlb/etc
 * being used to back guest memory. This would leave a smaller amount of memory,
 * e.g. 16G, reserved for the hypervisor to use. The pages to preserve across KHO
 * would be randomly distributed over the hypervisor memory. The hypervisor
 * memory is not required to be contiguous.
 *
 * This approach is fully incremental: as the serialization progresses, folios
 * can continue to be added to the tracker. The final step, immediately prior
 * to kexec, serializes the xarray information into a linked list for the
 * successor kernel to parse.
 *
 * The serializing side uses two levels of xarrays to manage chunks of per-order
 * 512 byte bitmaps. For instance the entire 1G order of a 1TB system would fit
 * inside a single 512 byte bitmap. For order 0 allocations each bitmap will
 * cover 16M of address space. Thus, for 16G of hypervisor memory at most 512K
 * of bitmap memory will be needed for order 0.
 */
struct kho_mem_track {
	/* Points to kho_mem_phys, each order gets its own bitmap tree */
	struct xarray orders;
};

struct kho_mem_phys {
	/*
	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
	 * to order.
	 */
	struct xarray phys_bits;
};

#define PRESERVE_BITS (512 * 8)
struct kho_mem_phys_bits {
	DECLARE_BITMAP(preserve, PRESERVE_BITS);
};

static void *
xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t elmsz)
{
	void *elm;
	void *res;

	elm = xa_load(xa, index);
	if (elm)
		return elm;

	elm = kzalloc(elmsz, GFP_KERNEL);
	if (!elm)
		return ERR_PTR(-ENOMEM);
	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
	if (xa_is_err(res)) {
		kfree(elm);
		return ERR_PTR(xa_err(res));
	}
	if (res != NULL) {
		kfree(elm);
		return res;
	}
	return elm;
}

/*
 * Record that the entire folio under virt is preserved across KHO. virt must
 * have come from alloc_pages/folio_alloc or similar and point to the first page
 * of the folio. The order will be preserved as well.
 */
int kho_preserve_folio(struct kho_mem_track *tracker, void *virt)
{
	struct folio *folio = virt_to_folio(virt);
	unsigned int order = folio_order(folio);
	phys_addr_t phys = virt_to_phys(virt);
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa;

	might_sleep();

	physxa = xa_load_or_alloc(&tracker->orders, order, sizeof(*physxa));
	if (IS_ERR(physxa))
		return PTR_ERR(physxa);

	phys >>= PAGE_SHIFT + order;
	static_assert(sizeof(phys_addr_t) <= sizeof(unsigned long));
	bits = xa_load_or_alloc(&physxa->phys_bits, phys / PRESERVE_BITS,
				sizeof(*bits));
	if (IS_ERR(bits))
		return PTR_ERR(bits);

	set_bit(phys % PRESERVE_BITS, bits->preserve);
	return 0;
}
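
On the allocating side, usage might look like the following minimal
sketch (the tracker initialization and every name here is mine, not
part of the proposal):

/*
 * Hypothetical caller: allocate a folio for hypervisor state and mark
 * it preserved across the next KHO kexec.
 */
static struct kho_mem_track hyp_tracker = {
	.orders = XARRAY_INIT(hyp_tracker.orders, 0),
};

static void *hyp_alloc_preserved(unsigned int order)
{
	struct folio *folio = folio_alloc(GFP_KERNEL, order);

	if (!folio)
		return NULL;
	if (kho_preserve_folio(&hyp_tracker, folio_address(folio))) {
		folio_put(folio);
		return NULL;
	}
	return folio_address(folio);
}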

#define KHOSER_PTR(type)  union {phys_addr_t phys; type ptr;}
#define KHOSER_STORE_PTR(dest, val)                 \
	({                                          \
		typecheck(typeof((dest).ptr), val); \
		(dest).phys = virt_to_phys(val);    \
	})
#define KHOSER_LOAD_PTR(src) \
	((typeof((src).ptr))((src).phys ? phys_to_virt((src).phys) : NULL))

struct khoser_mem_bitmap_ptr {
	phys_addr_t phys_start;
	KHOSER_PTR(struct kho_mem_phys_bits *) bitmap;
};

struct khoser_mem_chunk {
	unsigned int order;
	unsigned int num_elms;
	KHOSER_PTR(struct khoser_mem_chunk *) next;
	/* Fill the remainder of the page; the header above is 16 bytes */
	struct khoser_mem_bitmap_ptr
		bitmaps[(PAGE_SIZE - 16) / sizeof(struct khoser_mem_bitmap_ptr)];
};
static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);

static int new_chunk(struct khoser_mem_chunk **cur_chunk, unsigned long order)
{
	struct khoser_mem_chunk *chunk;

	chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
	if (!chunk)
		return -ENOMEM;
	chunk->order = order;
	if (*cur_chunk)
		KHOSER_STORE_PTR((*cur_chunk)->next, chunk);
	*cur_chunk = chunk;
	return 0;
}

/*
 * Record all the bitmaps in a linked list of pages for the next kernel to
 * process. Each chunk holds bitmaps of the same order and each block of bitmaps
 * starts at a given physical address. This allows the bitmaps to be sparse. The
 * xarray is used to store them in a tree while building up the data structure,
 * but the KHO successor kernel only needs to process them once in order.
 *
 * All of this memory is normal kmalloc() memory and is not marked for
 * preservation. The successor kernel will remain isolated to the scratch space
 * until it completes processing this list. Once processed, all the memory
 * storing these ranges will be marked as free.
 */
int kho_serialize(struct kho_mem_track *tracker, phys_addr_t *fdt_value)
{
	struct khoser_mem_chunk *first_chunk = NULL;
	struct khoser_mem_chunk *chunk = NULL;
	struct kho_mem_phys *physxa;
	unsigned long order;
	int ret;

	xa_for_each(&tracker->orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		ret = new_chunk(&chunk, order);
		if (ret)
			goto err_free;
		if (!first_chunk)
			first_chunk = chunk;

		xa_for_each(&physxa->phys_bits, phys, bits) {
			struct khoser_mem_bitmap_ptr *elm;

			if (chunk->num_elms == ARRAY_SIZE(chunk->bitmaps)) {
				ret = new_chunk(&chunk, order);
				if (ret)
					goto err_free;
			}

			elm = &chunk->bitmaps[chunk->num_elms];
			chunk->num_elms++;
			elm->phys_start = (phys * PRESERVE_BITS)
				<< (order + PAGE_SHIFT);
			KHOSER_STORE_PTR(elm->bitmap, bits);
		}
	}
	*fdt_value = first_chunk ? virt_to_phys(first_chunk) : 0;
	return 0;
err_free:
	chunk = first_chunk;
	while (chunk) {
		struct khoser_mem_chunk *tmp = chunk;
		chunk = KHOSER_LOAD_PTR(chunk->next);
		kfree(tmp);
	}
	return ret;
}
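
Then the last step before kexec is just writing the returned phys_addr
into the FDT, e.g. (hypothetical glue, property name from the schema
sketch above):

/*
 * Hypothetical final step: flatten the tracker and record the chunk
 * list head under the root node of the KHO FDT.
 */
static int kho_finalize_mem(void *fdt, struct kho_mem_track *tracker)
{
	phys_addr_t head;
	int ret;

	ret = kho_serialize(tracker, &head);
	if (ret)
		return ret;
	return fdt_setprop_u64(fdt, 0, "preserved-folio-map", head);
}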

static void preserve_bitmap(unsigned int order,
			    struct khoser_mem_bitmap_ptr *elm)
{
	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
	unsigned int bit;

	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
		phys_addr_t phys = elm->phys_start +
			((phys_addr_t)bit << (order + PAGE_SHIFT));

		// Do the struct page stuff..
	}
}

void kho_deserialize(phys_addr_t fdt_value)
{
	struct khoser_mem_chunk *chunk;

	if (!fdt_value)
		return;

	chunk = phys_to_virt(fdt_value);
	while (chunk) {
		unsigned int i;

		for (i = 0; i != chunk->num_elms; i++)
			preserve_bitmap(chunk->order, &chunk->bitmaps[i]);
		chunk = KHOSER_LOAD_PTR(chunk->next);
	}
}
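
And the successor kernel would find the list through the FDT early in
boot, something like (again hypothetical glue):

/*
 * Hypothetical successor-kernel entry: look up the property written by
 * the old kernel and walk the chunk list before any of this memory can
 * be handed to the allocator.
 */
static void __init kho_restore_mem(const void *fdt)
{
	const fdt64_t *prop;

	prop = fdt_getprop(fdt, 0, "preserved-folio-map", NULL);
	if (prop)
		kho_deserialize(fdt64_to_cpu(*prop));
}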



