>
> This patch adds the guest implementation in order to maintain the list of
> pages which are freed by the guest and are not reused. To avoid any
> reallocation it takes a seqlock once the list is completely filled.
> Though it doesn't carry the hypercall-related changes.
>
> Signed-off-by: Nitesh Narayan Lal <nilal@xxxxxxxxxx>
> ---
>  virt/kvm/page_hinting.c | 246 +++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 244 insertions(+), 2 deletions(-)
>
> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
> index 39d2b1d..cfdc513 100644
> --- a/virt/kvm/page_hinting.c
> +++ b/virt/kvm/page_hinting.c
> @@ -3,7 +3,7 @@
>  #include <linux/page_ref.h>
>  #include <linux/kvm_host.h>
>  #include <linux/sort.h>
> -
> +#include <linux/kernel.h>
>  #include kvm.h>
>
>  #define MAX_FGPT_ENTRIES 1000
> @@ -33,14 +33,256 @@ struct hypervisor_pages {
>  	unsigned int pages;
>  };
>
> +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(guest_page_lock);
>  DEFINE_PER_CPU(struct kvm_free_pages [MAX_FGPT_ENTRIES], kvm_pt);
>  DEFINE_PER_CPU(int, kvm_pt_idx);
> -struct hypervisor_pages hypervisor_pagelist[MAX_FGPT_ENTRIES];
> +struct hypervisor_pages hypervisor_pagelist[MAX_FGPT_ENTRIES - 1];
> +
> +static void empty_hyperlist(void)
> +{
> +	int i = 0;
> +
> +	while (i < MAX_FGPT_ENTRIES - 1) {
> +		hypervisor_pagelist[i].pfn = 0;
> +		hypervisor_pagelist[i].pages = 0;
> +		i++;
> +	}
> +}
> +
> +static void make_hypercall(void)
> +{
> +	/*
> +	 * Dummy function: To be filled later.
> +	 */
> +	empty_hyperlist();
> +}
> +
> +static int sort_pfn(const void *a1, const void *b1)
> +{
> +	const struct hypervisor_pages *a = a1;
> +	const struct hypervisor_pages *b = b1;
> +
> +	if (a->pfn > b->pfn)
> +		return 1;
> +
> +	if (a->pfn < b->pfn)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int pack_hyperlist(void)
> +{
> +	int i = 0, j = 0;
> +
> +	while (i < MAX_FGPT_ENTRIES - 1) {
> +		if (hypervisor_pagelist[i].pfn != 0) {
> +			hypervisor_pagelist[j].pfn =
> +					hypervisor_pagelist[i].pfn;
> +			hypervisor_pagelist[j].pages =
> +					hypervisor_pagelist[i].pages;
> +			j++;
> +		}
> +		i++;
> +	}
> +	i = j;
> +	while (j < MAX_FGPT_ENTRIES - 1) {
> +		hypervisor_pagelist[j].pfn = 0;
> +		hypervisor_pagelist[j].pages = 0;
> +		j++;
> +	}
> +	return i;
> +}
> +
> +int compress_hyperlist(void)
> +{
> +	int i = 0, j = 1, merge_counter = 0, ret = 0;
> +
> +	sort(hypervisor_pagelist, MAX_FGPT_ENTRIES - 1,
> +	     sizeof(struct hypervisor_pages), sort_pfn, NULL);
> +	while (i < MAX_FGPT_ENTRIES - 1 && j < MAX_FGPT_ENTRIES - 1) {
> +		unsigned long pfni = hypervisor_pagelist[i].pfn;
> +		unsigned int pagesi = hypervisor_pagelist[i].pages;
> +		unsigned long pfnj = hypervisor_pagelist[j].pfn;
> +		unsigned int pagesj = hypervisor_pagelist[j].pages;
> +
> +		if (pfnj <= pfni) {
> +			if (((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) &&
> +			    ((pfnj + pagesj - 1) >= (pfni - 1))) {
> +				hypervisor_pagelist[i].pfn = pfnj;
> +				hypervisor_pagelist[i].pages += pfni - pfnj;
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			} else if ((pfnj + pagesj - 1) > (pfni + pagesi - 1)) {
> +				hypervisor_pagelist[i].pfn = pfnj;
> +				hypervisor_pagelist[i].pages = pagesj;
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			}
> +		}
> +		if (pfnj > pfni) {
> +			if ((pfnj + pagesj - 1) > (pfni + pagesi - 1) &&
> +			    (pfnj <= pfni + pagesi)) {
> +				hypervisor_pagelist[i].pages +=
> +						(pfnj + pagesj - 1) -
> +						(pfni + pagesi - 1);
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			}
> +			if ((pfnj + pagesj - 1) <= (pfni + pagesi - 1)) {
> +				hypervisor_pagelist[j].pfn = 0;
> +				hypervisor_pagelist[j].pages = 0;
> +				j++;
> +				merge_counter++;
> +				continue;
> +			}
> +		}
> +		i = j;
> +		j++;
> +	}
> +	if (merge_counter != 0)
> +		ret = pack_hyperlist() - 1;
> +	else
> +		ret = MAX_FGPT_ENTRIES - 1;
> +	return ret;
> +}
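
If I'm reading compress_hyperlist() right, after the sort() this is
essentially an in-place merge of overlapping or adjacent pfn ranges. Just to
check my understanding, a stripped-down version of that merge (my own naming,
assumes the zero/empty entries were already packed out, so not a drop-in
replacement) would look roughly like this:

	/*
	 * Merge sorted [pfn, pfn + pages) ranges in place and return how
	 * many entries survive.  Purely illustrative.
	 */
	static int merge_sorted_ranges(struct hypervisor_pages *list, int count)
	{
		int out = 0, i;

		if (!count)
			return 0;

		for (i = 1; i < count; i++) {
			unsigned long cur_end = list[out].pfn + list[out].pages;

			if (list[i].pfn <= cur_end) {
				/* Overlap or direct adjacency: extend the current range. */
				unsigned long new_end = list[i].pfn + list[i].pages;

				if (new_end > cur_end)
					list[out].pages = new_end - list[out].pfn;
			} else {
				/* Gap: start a new output range. */
				list[++out] = list[i];
			}
		}
		return out + 1;
	}

If that matches what the i/j bookkeeping above is doing, a comment spelling
out the invariant would help, since the branch conditions are hard to follow.
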
> +
> +void copy_hyperlist(int hyper_idx)
> +{
> +	int *idx = &get_cpu_var(kvm_pt_idx);
> +	struct kvm_free_pages *free_page_obj;
> +	int i = 0;
> +
> +	free_page_obj = &get_cpu_var(kvm_pt)[0];
> +	while (i < hyper_idx) {
> +		free_page_obj[*idx].pfn = hypervisor_pagelist[i].pfn;
> +		free_page_obj[*idx].pages = hypervisor_pagelist[i].pages;
> +		*idx += 1;
> +		i++;
> +	}
> +	empty_hyperlist();
> +	put_cpu_var(kvm_pt);
> +	put_cpu_var(kvm_pt_idx);
> +}
> +
> +/*
> + * arch_free_page_slowpath() - This function adds the guest free page entries
> + * to hypervisor_pages list and also ensures defragmentation prior to addition
> + * if it is present with any entry of the kvm_free_pages list.
> + */
> +void arch_free_page_slowpath(void)
> +{
> +	int idx = 0;
> +	int hyper_idx = -1;
> +	int *kvm_idx = &get_cpu_var(kvm_pt_idx);
> +	struct kvm_free_pages *free_page_obj = &get_cpu_var(kvm_pt)[0];
> +
> +	write_seqlock(&guest_page_lock);
> +	while (idx < MAX_FGPT_ENTRIES) {
> +		unsigned long pfn = free_page_obj[idx].pfn;
> +		unsigned long pfn_end = free_page_obj[idx].pfn +
> +					free_page_obj[idx].pages - 1;
> +		bool prev_free = false;
> +
> +		while (pfn <= pfn_end) {
> +			struct page *p = pfn_to_page(pfn);

just one thing here, if we want/plan to use pmem as RAM this might not work,
because the pfn will not have a corresponding 'struct page'?
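
To make the concern concrete, the walk could probably skip such pfns with
something along these lines (completely untested; assuming pfn_valid() is an
acceptable test here -- ZONE_DEVICE-backed pmem does get struct pages, so the
exact check may need more thought):

	while (pfn <= pfn_end) {
		struct page *p;

		/* Skip pfns that have no memmap entry at all. */
		if (!pfn_valid(pfn)) {
			pfn++;
			prev_free = false;
			continue;
		}
		p = pfn_to_page(pfn);
		...
	}
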
> +
> +			if (PageCompound(p)) {
> +				struct page *head_page = compound_head(p);
> +				unsigned long head_pfn = page_to_pfn(head_page);
> +				unsigned int alloc_pages =
> +					1 << compound_order(head_page);
> +
> +				pfn = head_pfn + alloc_pages;
> +				prev_free = false;
> +				continue;
> +			}
> +			if (page_ref_count(p)) {
> +				pfn++;
> +				prev_free = false;
> +				continue;
> +			}
> +			/*
> +			 * The page is free so add it to the list and free the
> +			 * hypervisor_pagelist if required.
> +			 */
> +			if (!prev_free) {
> +				hyper_idx++;
> +				if (hyper_idx == MAX_FGPT_ENTRIES - 1) {
> +					hyper_idx = compress_hyperlist();
> +					if (hyper_idx >=
> +						HYPERLIST_THRESHOLD - 1) {
> +						make_hypercall();
> +						hyper_idx = 0;
> +					}
> +				}
> +				hypervisor_pagelist[hyper_idx].pfn = pfn;
> +				hypervisor_pagelist[hyper_idx].pages = 1;
> +				/*
> +				 * If the next contiguous page is free, it can
> +				 * be added to this same entry.
> +				 */
> +				prev_free = true;
> +			} else {
> +				/*
> +				 * Multiple adjacent free pages
> +				 */
> +				hypervisor_pagelist[hyper_idx].pages++;
> +			}
> +			pfn++;
> +		}
> +		free_page_obj[idx].pfn = 0;
> +		free_page_obj[idx].pages = 0;
> +		idx++;
> +	}
> +	*kvm_idx = 0;
> +	put_cpu_var(kvm_pt);
> +	put_cpu_var(kvm_pt_idx);
> +	write_sequnlock(&guest_page_lock);
> +}
>
>  void arch_alloc_page(struct page *page, int order)
>  {
> +	unsigned int seq;
> +
> +	/*
> +	 * arch_free_page will acquire the lock once the list carrying guest
> +	 * free pages is full and a hypercall will be made. Until complete free
> +	 * page list is traversed no further allocation will be allowed.
> +	 */
> +	do {
> +		seq = read_seqbegin(&guest_page_lock);
> +	} while (read_seqretry(&guest_page_lock, seq));
>  }
>
>  void arch_free_page(struct page *page, int order)
>  {
> +	int *free_page_idx = &get_cpu_var(kvm_pt_idx);
> +	struct kvm_free_pages *free_page_obj;
> +	unsigned long flags;
> +
> +	/*
> +	 * use of global variables may trigger a race condition between irq and
> +	 * process context causing unwanted overwrites. This will be replaced
> +	 * with a better solution to prevent such race conditions.
> +	 */
> +	local_irq_save(flags);
> +	free_page_obj = &get_cpu_var(kvm_pt)[0];
> +	free_page_obj[*free_page_idx].pfn = page_to_pfn(page);
> +	free_page_obj[*free_page_idx].pages = 1 << order;
> +	*free_page_idx += 1;
> +	if (*free_page_idx == MAX_FGPT_ENTRIES)
> +		arch_free_page_slowpath();
> +	put_cpu_var(kvm_pt);
> +	put_cpu_var(kvm_pt_idx);
> +	local_irq_restore(flags);
>  }
> --
> 2.9.4
>
>
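
One more question on the locking, mostly to check my understanding:
arch_alloc_page() only spins on the seqlock so that allocations stall while
arch_free_page_slowpath() holds the write side and drains the list -- the
read-side critical section is empty. For contrast, the usual seqlock read
pattern retries a read of the protected data, roughly like this (purely
illustrative, the field read below is just an example):

	unsigned int seq;
	unsigned long first_pfn;

	do {
		seq = read_seqbegin(&guest_page_lock);
		first_pfn = hypervisor_pagelist[0].pfn;
	} while (read_seqretry(&guest_page_lock, seq));

If the intent really is "wait for the writer to finish" rather than "get a
consistent snapshot", a short comment saying so would make that clearer.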