Re: [PATCH v6 06/12] s390/mm: Add huge page dirty sync support

David Hildenbrand <david@xxxxxxxxxx> · Fri, 13 Jul 2018 10:16:33 +0200

On 13.07.2018 08:36, Janosch Frank wrote:
> From: Janosch Frank <frankja@xxxxxxxxxxxxxxxxxx>
> 
> To do dirty loging with huge pages, we protect huge pmds in the
> gmap. When they are written to, we unprotect them and mark them dirty.
> 
> We introduce the function gmap_test_and_clear_dirty_segment which
> handles dirty sync for huge pages.
> 
> Signed-off-by: Janosch Frank <frankja@xxxxxxxxxxxxx>
> ---
>  arch/s390/include/asm/gmap.h |   3 ++
>  arch/s390/kvm/kvm-s390.c     |  19 ++++---
>  arch/s390/mm/gmap.c          | 119 ++++++++++++++++++++++++++++++++++++++++++-
>  arch/s390/mm/pgtable.c       |   6 ---
>  4 files changed, 132 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
> index 276268b48aff..f923ed27ac6e 100644
> --- a/arch/s390/include/asm/gmap.h
> +++ b/arch/s390/include/asm/gmap.h
> @@ -15,6 +15,7 @@
>  
>  /* Status bits only for huge segment entries */
>  #define _SEGMENT_ENTRY_GMAP_IN		0x8000	/* invalidation notify bit */
> +#define _SEGMENT_ENTRY_GMAP_UC		0x4000	/* user dirty (migration) */
>  
>  /**
>   * struct gmap_struct - guest address space
> @@ -139,4 +140,6 @@ void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
>  int gmap_mprotect_notify(struct gmap *, unsigned long start,
>  			 unsigned long len, int prot);
>  
> +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
> +			     unsigned long gaddr, unsigned long vmaddr);
>  #endif /* _ASM_S390_GMAP_H */
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index 3b7a5151b6a5..6acc46cc7f7f 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -511,19 +511,24 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  }
>  
>  static void kvm_s390_sync_dirty_log(struct kvm *kvm,
> -					struct kvm_memory_slot *memslot)
> +				    struct kvm_memory_slot *memslot)
>  {
>  	gfn_t cur_gfn, last_gfn;
> -	unsigned long address;
> +	unsigned long gaddr, vmaddr;
> +	unsigned long *dirty = memslot->dirty_bitmap;
>  	struct gmap *gmap = kvm->arch.gmap;
>  
> -	/* Loop over all guest pages */
> +	/* Loop over all guest segments */
> +	cur_gfn = memslot->base_gfn;
>  	last_gfn = memslot->base_gfn + memslot->npages;
> -	for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) {
> -		address = gfn_to_hva_memslot(memslot, cur_gfn);
> +	for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES, dirty += 4) {
> +		gaddr = gfn_to_gpa(cur_gfn);
> +		vmaddr = gfn_to_hva_memslot(memslot, cur_gfn);
> +		if (kvm_is_error_hva(vmaddr))
> +			continue;
> +
> +		gmap_sync_dirty_log_pmd(gmap, dirty, gaddr, vmaddr);

I am not a friend of this interface. If ever somebody decides to change
anything in mark_page_dirty(), this will most probably be missed. You're
basically copying the logic we have in that function because you're not
allowed to pass "struct kvm" itself.

Isn't there a way to call mark_page_dirty() from here with the new
interface? E.g. let gmap_sync_dirty_log_pmd() indicate whether a single
page or the whole segment is dirty. You can then advance either
_PAGE_ENTRIES or only a single page.

OR (which would be performance wise better):

Create your own temporary zeroed bitmap (#_PAGE_ENTRIES bits - 256bit ==
4 * sizeof(unsigned long)) and pass that instead. Then go over all bits
and call mark_page_dirty(). At least that looks cleaner to me (no need
to fiddle with bitmaps we don't control - e.g. not having to use
set_bit_le() or having to check if the bitmap pointer is actually valid).

>  
> -		if (test_and_clear_guest_dirty(gmap->mm, address))
> -			mark_page_dirty(kvm, cur_gfn);
>  		if (fatal_signal_pending(current))
>  			return;
>  		cond_resched();
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index a6d499d2b24b..90e2d2f0e298 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
> @@ -15,6 +15,7 @@
>  #include <linux/swapops.h>
>  #include <linux/ksm.h>
>  #include <linux/mman.h>
> +#include <linux/hugetlb.h>
>  
>  #include <asm/pgtable.h>
>  #include <asm/pgalloc.h>
> @@ -521,6 +522,9 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table,
>  	rcu_read_unlock();
>  }
>  
> +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
> +			   unsigned long gaddr);
> +
>  /**
>   * gmap_link - set up shadow page tables to connect a host to a guest address
>   * @gmap: pointer to guest mapping meta data structure
> @@ -541,6 +545,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
>  	p4d_t *p4d;
>  	pud_t *pud;
>  	pmd_t *pmd;
> +	pmd_t unprot;
>  	int rc;
>  
>  	BUG_ON(gmap_is_shadow(gmap));
> @@ -598,12 +603,19 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
>  				       vmaddr >> PMD_SHIFT, table);
>  		if (!rc) {
>  			if (pmd_large(*pmd)) {
> -				*table = pmd_val(*pmd) &
> -					_SEGMENT_ENTRY_HARDWARE_BITS_LARGE;
> +				*table = (pmd_val(*pmd) &
> +					  _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
> +					| _SEGMENT_ENTRY_GMAP_UC;
>  			} else
>  				*table = pmd_val(*pmd) &
>  					_SEGMENT_ENTRY_HARDWARE_BITS;
>  		}
> +	} else if (*table & _SEGMENT_ENTRY_PROTECT &&
> +		   !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
> +		unprot = __pmd((*table & (_SEGMENT_ENTRY_HARDWARE_BITS_LARGE
> +					  & ~_SEGMENT_ENTRY_PROTECT))
> +			       | _SEGMENT_ENTRY_GMAP_UC);

Maybe split this into several lines to make this more readable.

uint64_t unprot;

unprot = *table & _SEGMENT_ENTRY_HARDWARE_BITS_LARGE;
unprot &= ~_SEGMENT_ENTRY_PROTECT;
unprot |= _SEGMENT_ENTRY_GMAP_UC;

... __pmd(unprot) ...

Do we have to check here some place if the original pmd is actually
allowed to write? (e.g. what about read-only mappings ?) Or is that
handled by the fault handler already?

> +		gmap_pmdp_xchg(gmap, (pmd_t *)table, unprot, gaddr);

Also, I wonder if gmap_pmdp_xchg should take care of cleaning notifier
bits from the new pmd value itself? It calls all notifiers and new
notifiers can only be registered by protecting again. So masking of
_SEGMENT_ENTRY_HARDWARE_BITS_LARGE is not really needed if done in
gmap_pmdp_xchg.

>  	}
>  	spin_unlock(&gmap->guest_table_lock);
>  	spin_unlock(ptl);
> @@ -928,11 +940,23 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
>  {
>  	int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
>  	int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
> +	pmd_t new = *pmdp;
>  
>  	/* Fixup needed */
>  	if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
>  		return -EAGAIN;
>  
> +	if (prot == PROT_NONE && !pmd_i) {
> +		pmd_val(new) |= _SEGMENT_ENTRY_INVALID;
> +		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
> +	}
> +
> +	if (prot == PROT_READ && !pmd_p) {
> +		pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID;
> +		pmd_val(new) |= _SEGMENT_ENTRY_PROTECT;
> +		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
> +	}
> +
>  	if (bits & GMAP_NOTIFY_MPROT)
>  		pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN;
>  
> @@ -2219,6 +2243,13 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
>  }
>  EXPORT_SYMBOL_GPL(ptep_notify);
>  
> +static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
> +			     unsigned long gaddr)

gmap_pmdp_notify?

> +{
> +	pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN;
> +	gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
> +}
> +
>  /**
>   * pmdp_notify - call all invalidation callbacks for a specific pmd
>   * @mm: pointer to the process mm_struct
> @@ -2249,6 +2280,31 @@ void pmdp_notify(struct mm_struct *mm, unsigned long vmaddr)
>  }
>  EXPORT_SYMBOL_GPL(pmdp_notify);
>  
> +/**
> + * gmap_pmdp_xchg - exchange a gmap pmd with another
> + * @gmap: pointer to the guest address space structure
> + * @pmdp: pointer to the pmd entry
> + * @new: replacement entry
> + * @gaddr: the affected guest address
> + *
> + * This function is assumed to be called with the guest_table_lock
> + * held.
> + */
> +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
> +			   unsigned long gaddr)
> +{
> +	gaddr &= HPAGE_MASK;
> +	pmdp_notify_gmap(gmap, pmdp, gaddr);
> +	if (MACHINE_HAS_TLB_GUEST)
> +		__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
> +			    IDTE_GLOBAL);
> +	else if (MACHINE_HAS_IDTE)
> +		__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
> +	else
> +		__pmdp_csp(pmdp);
> +	*pmdp = new;
> +}
> +
>  static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
>  			    int purge)
>  {
> @@ -2366,6 +2422,65 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
>  }
>  EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
>  
> +/**
> + * gmap_test_and_clear_dirty_segment - test and reset segment dirty status
> + * @gmap: pointer to guest address space
> + * @pmdp: pointer to the pmd to be tested
> + * @gaddr: virtual address in the guest address space
> + *
> + * This function is assumed to be called with the guest_table_lock
> + * held.
> + */
> +bool gmap_test_and_clear_dirty_segment(struct gmap *gmap, pmd_t *pmdp,
> +				       unsigned long gaddr)
> +{
> +	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
> +		return false;
> +
> +	/* Already protected memory, which did not change is clean */
> +	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
> +	    !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
> +		return false;
> +
> +	/* Clear UC indication and reset protection */
> +	pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC;
> +	gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
> +	return true;
> +}
> +
> +/**
> + * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
> + * @gmap: pointer to guest address space
> + * @bitmap: dirty bitmap for this pmd
> + * @gaddr: virtual address in the guest address space
> + * @vmaddr: virtual address in the host address space
> + *
> + * This function is assumed to be called with the guest_table_lock
> + * held.
> + */
> +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
> +			     unsigned long gaddr, unsigned long vmaddr)
> +{
> +	int i = 0;
> +	pmd_t *pmdp;
> +
> +	pmdp = gmap_pmd_op_walk(gmap, gaddr);
> +	if (!pmdp)
> +		return;
> +
> +	if (pmd_large(*pmdp)) {
> +		if (gmap_test_and_clear_dirty_segment(gmap, pmdp, gaddr))
> +			memset(bitmap, 0xff, 32);

memset on bitmaps where your architecture is BE but the bitmap is LE.
This looks dangerous. It might work but it is definitely ugly from my
POV. If you pass in a temporary bitmap as I suggested, then you could at
least use bitmap_set() here.

> +	} else {
> +		for (; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
> +			if (test_and_clear_guest_dirty(gmap->mm, vmaddr))

Does this interface still make sense? I mean we already walked almost
all page tables, wouldn't it be easier to have a
gmap_test_and_clear_dirty() now instead? So we can avoid walking the
page tables again down to the PGSTE. (can be an addon patch of course)

> +				set_bit_le(i, bitmap);
> +		}
> +	}
> +	gmap_pmd_op_end(gmap, pmdp);
> +}
> +EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
> +
>  static inline void thp_split_mm(struct mm_struct *mm)
>  {
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
> index 7bdb15fc5487..c393a6b0f362 100644
> --- a/arch/s390/mm/pgtable.c
> +++ b/arch/s390/mm/pgtable.c
> @@ -731,12 +731,6 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
>  	pmd = pmd_alloc(mm, pud, addr);
>  	if (!pmd)
>  		return false;
> -	/* We can't run guests backed by huge pages, but userspace can
> -	 * still set them up and then try to migrate them without any
> -	 * migration support.
> -	 */
> -	if (pmd_large(*pmd))
> -		return true;
>  
>  	ptep = pte_alloc_map_lock(mm, pmd, addr, &ptl);
>  	if (unlikely(!ptep))
> 

-- 

Thanks,

David / dhildenb