On 06/11/2018 03:34 PM, Luiz Capitulino wrote:
> On Mon, 11 Jun 2018 11:18:57 -0400
> nilal@xxxxxxxxxx wrote:
>
>> From: Nitesh Narayan Lal <nilal@xxxxxxxxxx>
>>
>> This patch adds the guest implementation to maintain the list of pages
>> which are freed by the guest and are not reused. To avoid any
>> reallocation, a seqlock is taken once the list is completely filled.
>> It does not carry the hypercall-related changes, though.
>>
>> Signed-off-by: Nitesh Narayan Lal <nilal@xxxxxxxxxx>
>> ---
>>  virt/kvm/page_hinting.c | 287 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 287 insertions(+)
>>  create mode 100644 virt/kvm/page_hinting.c
>>
>> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
>> new file mode 100644
>> index 0000000..77dc865
>> --- /dev/null
>> +++ b/virt/kvm/page_hinting.c
>> @@ -0,0 +1,287 @@
>> +#include <linux/gfp.h>
>> +#include <linux/mm.h>
>> +#include <linux/page_ref.h>
>> +#include <linux/kvm_host.h>
>> +#include <linux/sort.h>
>> +#include <linux/kernel.h>
>> +
>> +#define MAX_FGPT_ENTRIES        1000
>> +#define HYPERLIST_THRESHOLD     1       /* FIXME: find a good threshold */
>> +/*
>> + * struct kvm_free_pages - Tracks the pages which are freed by the guest.
>> + * @pfn - page frame number for the page which is to be freed
>> + * @pages - number of pages which are supposed to be freed.
>> + * A global array object is used to hold the list of pfn and number of pages
>> + * which are freed by the guest. This list may also have fragmented pages, so
>> + * defragmentation is a must prior to the hypercall.
>> + */
>> +struct kvm_free_pages {
>> +        unsigned long pfn;
>> +        unsigned int pages;
>> +};
>> +
>> +/*
>> + * hypervisor_pages - It is a dummy structure passed with the hypercall.
>> + * @pfn - page frame number for the page which is to be freed.
>> + * @pages - number of pages which are supposed to be freed.
>> + * A global array object is used to hold the list of pfn and pages and is
>> + * passed as part of the hypercall.
>> + */
>> +struct hypervisor_pages {
>> +        unsigned long pfn;
>> +        unsigned int pages;
>> +};

> Maybe rename 'pages' to 'nr_pages' in both structs?

Yeap, I can do this.
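For reference, both structs would then become (untested, nothing but the
rename):

    struct kvm_free_pages {
            unsigned long pfn;
            unsigned int nr_pages;
    };

    struct hypervisor_pages {
            unsigned long pfn;
            unsigned int nr_pages;
    };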
> More important comments below :)
>
>> +
>> +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(guest_page_lock);
>> +DEFINE_PER_CPU(struct kvm_free_pages [MAX_FGPT_ENTRIES], kvm_pt);
>> +DEFINE_PER_CPU(int, kvm_pt_idx);
>> +struct hypervisor_pages hypervisor_pagelist[MAX_FGPT_ENTRIES];
>> +
>> +static void empty_hyperlist(void)
>> +{
>> +        int i = 0;
>> +
>> +        while (i < MAX_FGPT_ENTRIES) {
>> +                hypervisor_pagelist[i].pfn = 0;
>> +                hypervisor_pagelist[i].pages = 0;
>> +                i++;
>> +        }
>> +}
>> +
>> +static void make_hypercall(void)
>> +{
>> +        /*
>> +         * Dummy function: to be filled in later.
>> +         */
>> +        empty_hyperlist();
>> +}
>> +
>> +static int sort_pfn(const void *a1, const void *b1)
>> +{
>> +        const struct hypervisor_pages *a = a1;
>> +        const struct hypervisor_pages *b = b1;
>> +
>> +        if (a->pfn > b->pfn)
>> +                return 1;
>> +
>> +        if (a->pfn < b->pfn)
>> +                return -1;
>> +
>> +        return 0;
>> +}
>> +
>> +static int pack_hyperlist(void)
>> +{
>> +        int i = 0, j = 0;
>> +
>> +        while (i < MAX_FGPT_ENTRIES) {
>> +                if (hypervisor_pagelist[i].pfn != 0) {
>> +                        if (i != j) {
>> +                                hypervisor_pagelist[j].pfn =
>> +                                                hypervisor_pagelist[i].pfn;
>> +                                hypervisor_pagelist[j].pages =
>> +                                                hypervisor_pagelist[i].pages;
>> +                        }
>> +                        j++;
>> +                }
>> +                i++;
>> +        }
>> +        i = j;
>> +        while (j < MAX_FGPT_ENTRIES) {
>> +                hypervisor_pagelist[j].pfn = 0;
>> +                hypervisor_pagelist[j].pages = 0;
>> +                j++;
>> +        }
>> +        return i;
>> +}
>> +
>> +int compress_hyperlist(void)
>> +{
>> +        int i = 0, j = 1, merge_counter = 0, ret = 0;
>> +
>> +        sort(hypervisor_pagelist, MAX_FGPT_ENTRIES,
>> +             sizeof(struct hypervisor_pages), sort_pfn, NULL);
>> +        while (i < MAX_FGPT_ENTRIES && j < MAX_FGPT_ENTRIES) {
>> +                unsigned long pfni = hypervisor_pagelist[i].pfn;
>> +                unsigned int pagesi = hypervisor_pagelist[i].pages;
>> +                unsigned long pfnj = hypervisor_pagelist[j].pfn;
>> +                unsigned int pagesj = hypervisor_pagelist[j].pages;
>> +
>> +                if (pfnj <= pfni) {
>> +                        if (((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) &&
>> +                            ((pfnj + pagesj - 1) >= (pfni - 1))) {
>> +                                hypervisor_pagelist[i].pfn = pfnj;
>> +                                hypervisor_pagelist[i].pages += pfni - pfnj;
>> +                                hypervisor_pagelist[j].pfn = 0;
>> +                                hypervisor_pagelist[j].pages = 0;
>> +                                j++;
>> +                                merge_counter++;
>> +                                continue;
>> +                        } else if ((pfnj + pagesj - 1) > (pfni + pagesi - 1)) {
>> +                                hypervisor_pagelist[i].pfn = pfnj;
>> +                                hypervisor_pagelist[i].pages = pagesj;
>> +                                hypervisor_pagelist[j].pfn = 0;
>> +                                hypervisor_pagelist[j].pages = 0;
>> +                                j++;
>> +                                merge_counter++;
>> +                                continue;
>> +                        }
>> +                } else if (pfnj > pfni) {
>> +                        if ((pfnj + pagesj - 1) > (pfni + pagesi - 1) &&
>> +                            (pfnj <= pfni + pagesi)) {
>> +                                hypervisor_pagelist[i].pages +=
>> +                                                (pfnj + pagesj - 1) -
>> +                                                (pfni + pagesi - 1);
>> +                                hypervisor_pagelist[j].pfn = 0;
>> +                                hypervisor_pagelist[j].pages = 0;
>> +                                j++;
>> +                                merge_counter++;
>> +                                continue;
>> +                        } else if ((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) {
>> +                                hypervisor_pagelist[j].pfn = 0;
>> +                                hypervisor_pagelist[j].pages = 0;
>> +                                j++;
>> +                                merge_counter++;
>> +                                continue;
>> +                        }
>> +                }
>> +                i = j;
>> +                j++;
>> +        }
>> +        if (merge_counter != 0)
>> +                ret = pack_hyperlist() - 1;
>> +        else
>> +                ret = MAX_FGPT_ENTRIES - 1;
>> +        return ret;
>> +}
>> +
>> +void copy_hyperlist(int hyper_idx)
>> +{
>> +        int *idx = &get_cpu_var(kvm_pt_idx);
>> +        struct kvm_free_pages *free_page_obj;
>> +        int i = 0;
>> +
>> +        free_page_obj = &get_cpu_var(kvm_pt)[0];
>> +        while (i < hyper_idx) {
>> +                free_page_obj[*idx].pfn = hypervisor_pagelist[i].pfn;
>> +                free_page_obj[*idx].pages = hypervisor_pagelist[i].pages;
>> +                *idx += 1;
>> +                i++;
>> +        }
>> +        empty_hyperlist();
>> +        put_cpu_var(kvm_pt);
>> +        put_cpu_var(kvm_pt_idx);
>> +}
>> +
>> +/*
>> + * arch_free_page_slowpath() - This function adds the guest free page
>> + * entries to the hypervisor_pages list and also ensures defragmentation
>> + * prior to addition if an entry overlaps with any entry of the
>> + * kvm_free_pages list.
>> + */
>> +void arch_free_page_slowpath(void)
>> +{
>> +        int idx = 0;
>> +        int hyper_idx = -1;
>> +        int *kvm_idx = &get_cpu_var(kvm_pt_idx);
>> +        struct kvm_free_pages *free_page_obj = &get_cpu_var(kvm_pt)[0];
>> +
>> +        write_seqlock(&guest_page_lock);

> I think this lock is used to synchronize CPUs wanting to allocate pages
> with the CPU doing the free page reporting, right? In other words, once
> a CPU takes this lock no other CPU will be able to allocate pages in the
> guest. Is this correct?

Yes, that is correct. I have used the lock to prevent any allocations.
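To make the pairing explicit, the synchronization boils down to the usual
seqlock writer/reader pattern (condensed from the patch, nothing new):

    /*
     * Reporting side (arch_free_page_slowpath): blocks allocations
     * for the duration of the scan and the hypercall.
     */
    write_seqlock(&guest_page_lock);
    /* ... scan free_page_obj[], fill hypervisor_pagelist[] ... */
    write_sequnlock(&guest_page_lock);

    /* Allocation side (arch_alloc_page): spins until no report
     * is in progress. */
    do {
            seq = read_seqbegin(&guest_page_lock);
    } while (read_seqretry(&guest_page_lock, seq));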
> I'm wondering whether we could use some page flag for this purpose. I mean,
> the race condition here seems to be: once we decide to report a page as
> free, the guest can't use that page until the host has received and acked
> this page.
>
> So, is there any page flag we could use which:
>
> 1. Will prevent the page from being allocated when set
> 2. Won't mess up with the page allocator when setting it at free time
>    in arch_free_page()
>
> I don't know this myself.

I am not sure about this, I will try to find out.

>> +        while (idx < MAX_FGPT_ENTRIES) {
>> +                unsigned long pfn = free_page_obj[idx].pfn;
>> +                unsigned long pfn_end = free_page_obj[idx].pfn +
>> +                                        free_page_obj[idx].pages - 1;
>> +                bool prev_free = false;
>> +
>> +                while (pfn <= pfn_end) {
>> +                        struct page *p = pfn_to_page(pfn);
>> +
>> +                        if (PageCompound(p)) {
>> +                                struct page *head_page = compound_head(p);
>> +                                unsigned long head_pfn = page_to_pfn(head_page);
>> +                                unsigned int alloc_pages =
>> +                                        1 << compound_order(head_page);
>> +
>> +                                pfn = head_pfn + alloc_pages;
>> +                                prev_free = false;
>> +                                continue;
>> +                        }
>> +                        if (page_ref_count(p)) {
>> +                                pfn++;
>> +                                prev_free = false;
>> +                                continue;
>> +                        }
>> +                        /*
>> +                         * The page is free, so add it to the list and free
>> +                         * the hypervisor_pagelist if required.
>> +                         */
>> +                        if (!prev_free) {
>> +                                hyper_idx++;
>> +                                hypervisor_pagelist[hyper_idx].pfn = pfn;
>> +                                hypervisor_pagelist[hyper_idx].pages = 1;
>> +                                if (hyper_idx == MAX_FGPT_ENTRIES - 1) {
>> +                                        hyper_idx = compress_hyperlist();
>> +                                        if (hyper_idx >=
>> +                                            HYPERLIST_THRESHOLD) {
>> +                                                make_hypercall();
>> +                                                hyper_idx = 0;
>> +                                        }

> Do we have a clear definition of when free pages are reported to the host?
>
> If I'm understanding the code correctly, the following conditions have
> to be met:
>
> 1. The free_page_obj[] array fills up at least once (ie. we have
>    MAX_FGPT_ENTRIES entries at least once)
>
> 2. We skip pages that might already be freed and merge contiguous regions
>    into the same array entry
>
> 3. We only report free pages to the host when the number of non-contiguous
>    free regions reaches HYPERLIST_THRESHOLD
>
> If that's correct, then one problem with that approach is that we might
> report only a few megabytes (if the memory being freed by the guest is
> fragmented) or we might report several GBs at once (if the memory being
> freed by the guest happens to be contiguous).
>
> Does it make sense to instead allow users to configure the granularity
> they want the guest to give to the host? For example, users could do:

Yes, this is the reason why I have changed the THRESHOLD value to 1 for
now. I have to either allow the user to configure this value through the
qemu console or via a sysctl interface, or I have to add another condition
and constant, such as a memory threshold, to take care of this situation.

> (qemu) set-page-hinting 1GB
>
> So, every time a CPU has 1GB free it would report it to the host. Of
> course, this method also raises its own set of questions:
>
> - What's a good default?
> - How can users choose a good value for their workloads?

I am not entirely sure about this.
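The memory-threshold variant might look something like this (rough sketch
only; page_hinting_threshold is a made-up tunable that could later be wired
up to sysctl or a QEMU command, counted in pages):

    /*
     * Hypothetical tunable: report once this many free pages have
     * accumulated on a CPU (262144 4K pages == 1GB).
     */
    static unsigned long page_hinting_threshold = 262144;

    static unsigned long hinted_pages_total(int hyper_idx)
    {
            unsigned long total = 0;
            int i;

            for (i = 0; i <= hyper_idx; i++)
                    total += hypervisor_pagelist[i].pages;
            return total;
    }

    /* In arch_free_page_slowpath(), instead of comparing hyper_idx
     * against HYPERLIST_THRESHOLD: */
    if (hinted_pages_total(hyper_idx) >= page_hinting_threshold) {
            make_hypercall();
            hyper_idx = 0;
    }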
>> +                                }
>> +                                /*
>> +                                 * If the next contiguous page is free, it
>> +                                 * can be added to this same entry.
>> +                                 */
>> +                                prev_free = true;
>> +                        } else {
>> +                                /*
>> +                                 * Multiple adjacent free pages
>> +                                 */
>> +                                hypervisor_pagelist[hyper_idx].pages++;
>> +                        }
>> +                        pfn++;
>> +                }
>> +                free_page_obj[idx].pfn = 0;
>> +                free_page_obj[idx].pages = 0;
>> +                idx++;
>> +        }
>> +        *kvm_idx = 0;
>> +        put_cpu_var(kvm_pt);
>> +        put_cpu_var(kvm_pt_idx);
>> +        write_sequnlock(&guest_page_lock);
>> +}
>> +
>> +void arch_alloc_page(struct page *page, int order)
>> +{
>> +        unsigned int seq;
>> +
>> +        /*
>> +         * arch_free_page will acquire the lock once the list carrying
>> +         * guest free pages is full and a hypercall will be made. Until
>> +         * the complete free page list is traversed, no further
>> +         * allocation will be allowed.
>> +         */
>> +        do {
>> +                seq = read_seqbegin(&guest_page_lock);
>> +        } while (read_seqretry(&guest_page_lock, seq));
>> +}
>> +
>> +void arch_free_page(struct page *page, int order)
>> +{
>> +        int *free_page_idx = &get_cpu_var(kvm_pt_idx);

> Since you're disabling interrupts, I think you can use this_cpu_ptr()
> instead of get_cpu_var() (from inside the atomic section). And maybe pass
> free_page_idx and free_page_obj to arch_free_page_slowpath() to simplify
> a bit.
>
> But please ignore this if you like the suggestion below.
>
>> +        struct kvm_free_pages *free_page_obj;
>> +        unsigned long flags;
>> +
>> +        /*
>> +         * Use of global variables may trigger a race condition between
>> +         * irq and process context, causing unwanted overwrites. This will
>> +         * be replaced with a better solution to prevent such race
>> +         * conditions.
>> +         */
>> +        local_irq_save(flags);

> So, this is disabling interrupts on the CPU for the entire reporting
> duration. I have no idea how long this process takes to tell whether
> or not we're disabling interrupts for too long. But I think disabling
> interrupts is protecting against free_page_obj being accessed from
> interrupt context, right?

Yes, I introduced this because there was a race condition between process
context and the irq context.

> If that's correct and if we want to reduce the time duration in which
> interrupts are disabled, then the following idea might work:
>
> 1. Disable interrupts only to add the page's PFN to free_page_obj[]
> 2. Only call arch_free_page_slowpath() if !in_interrupt() is true
>
> We also have to protect against preemption, but get_cpu_var() will
> take care of this. If we don't want to disable preemption for this
> long either, then maybe we could use a mutex instead (in which case
> I guess you probably want to skip the whole thing if the mutex is
> already taken).

I can try using a mutex here. Thank you for the suggestion.
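Restructured along those lines, arch_free_page() might look roughly like
this (untested sketch of the suggestion above; the slowpath would still take
the seqlock, a task migration between the append and the slowpath call
remains possible, and the sketch ignores the case where the array fills up
while in interrupt context):

    void arch_free_page(struct page *page, int order)
    {
            int *free_page_idx;
            struct kvm_free_pages *free_page_obj;
            unsigned long flags;
            bool full;

            /* 1. Interrupts off only for the array append. */
            local_irq_save(flags);
            free_page_idx = &get_cpu_var(kvm_pt_idx);
            free_page_obj = &get_cpu_var(kvm_pt)[0];
            free_page_obj[*free_page_idx].pfn = page_to_pfn(page);
            free_page_obj[*free_page_idx].pages = 1 << order;
            *free_page_idx += 1;
            full = (*free_page_idx == MAX_FGPT_ENTRIES);
            put_cpu_var(kvm_pt);
            put_cpu_var(kvm_pt_idx);
            local_irq_restore(flags);

            /* 2. Report only from process context. */
            if (full && !in_interrupt())
                    arch_free_page_slowpath();
    }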
>> +        free_page_obj = &get_cpu_var(kvm_pt)[0];
>> +        free_page_obj[*free_page_idx].pfn = page_to_pfn(page);
>> +        free_page_obj[*free_page_idx].pages = 1 << order;
>> +        *free_page_idx += 1;
>> +        if (*free_page_idx == MAX_FGPT_ENTRIES)
>> +                arch_free_page_slowpath();
>> +        put_cpu_var(kvm_pt);
>> +        put_cpu_var(kvm_pt_idx);
>> +        local_irq_restore(flags);
>> +}

-- 
Regards
Nitesh