Re: [PATCH] s390/mm: implement software dirty bits

Mel Gorman <mgorman@xxxxxxx> · Wed, 6 Feb 2013 11:21:11 +0000

On Tue, Feb 05, 2013 at 10:12:05AM -0800, Martin Schwidefsky wrote:
> The s390 architecture is unique in respect to dirty page detection,
> it uses the change bit in the per-page storage key to track page
> modifications. All other architectures track dirty bits by means
> of page table entries. This property of s390 has caused numerous
> problems in the past, e.g. see git commit ef5d437f71afdf4a
> "mm: fix XFS oops due to dirty pages without buffers on s390".
> 
> To avoid future issues in regard to per-page dirty bits convert
> s390 to a fault based software dirty bit detection mechanism. All
> user page table entries which are marked as clean will be hardware
> read-only, even if the pte is supposed to be writable. A write by
> the user process will trigger a protection fault which will cause
> the user pte to be marked as dirty and the hardware read-only bit
> is removed.
> 
> With this change the dirty bit in the storage key is irrelevant
> for Linux as a host, but the storage key is still required for
> KVM guests. The effect is that page_test_and_clear_dirty and the
> related code can be removed. The referenced bit in the storage
> key is still used by the page_test_and_clear_young primitive to
> provide page age information.
> 
> For page cache pages of mappings with mapping_cap_account_dirty
> there will not be any change in behavior as the dirty bit tracking
> already uses read-only ptes to control the amount of dirty pages.
> Only for swap cache pages and pages of mappings without
> mapping_cap_account_dirty there can be additional protection faults.
> To avoid an excessive number of additional faults the mk_pte
> primitive checks for PageDirty if the pgprot value allows for writes
> and pre-dirties the pte. That avoids all additional faults for
> tmpfs and shmem pages until these pages are added to the swap cache.
> 
> Signed-off-by: Martin Schwidefsky <schwidefsky@xxxxxxxxxx>

I have a few clarifications below just to make sure I'm reading it right
but I think it looks fine and the change to mm/rmap.c is welcome.

> diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
> index 098adbb..2b3d3b6 100644
> --- a/arch/s390/include/asm/pgtable.h
> +++ b/arch/s390/include/asm/pgtable.h
> @@ -29,6 +29,7 @@
>  #ifndef __ASSEMBLY__
>  #include <linux/sched.h>
>  #include <linux/mm_types.h>
> +#include <linux/page-flags.h>
>  #include <asm/bug.h>
>  #include <asm/page.h>
>  
> @@ -221,13 +222,15 @@ extern unsigned long MODULES_END;
>  /* Software bits in the page table entry */
>  #define _PAGE_SWT	0x001		/* SW pte type bit t */
>  #define _PAGE_SWX	0x002		/* SW pte type bit x */
> -#define _PAGE_SWC	0x004		/* SW pte changed bit (for KVM) */
> -#define _PAGE_SWR	0x008		/* SW pte referenced bit (for KVM) */
> -#define _PAGE_SPECIAL	0x010		/* SW associated with special page */
> +#define _PAGE_SWC	0x004		/* SW pte changed bit */
> +#define _PAGE_SWR	0x008		/* SW pte referenced bit */
> +#define _PAGE_SWW	0x010		/* SW pte write bit */
> +#define _PAGE_SPECIAL	0x020		/* SW associated with special page */
>  #define __HAVE_ARCH_PTE_SPECIAL
>  
>  /* Set of bits not changed in pte_modify */
> -#define _PAGE_CHG_MASK	(PAGE_MASK | _PAGE_SPECIAL | _PAGE_SWC | _PAGE_SWR)
> +#define _PAGE_CHG_MASK		(PAGE_MASK | _PAGE_SPECIAL | _PAGE_CO | \
> +				 _PAGE_SWC | _PAGE_SWR)
>  

If I'm reading it right, the _PAGE_CO is bit is what allows you to force
the hardware to trap even if the PTE says the page is writable. This is
what necessitates the shuffling of _PAGE_SPECIAL so you have a software
write bit and a hardware write bit.

For existing distributions they might not be able to use this patch for
bugs like ""mm: fix XFS oops due to dirty pages without buffers on s390"
because this shuffling of bits will break KABIi but that's not your problem.

>  /* Six different types of pages. */
>  #define _PAGE_TYPE_EMPTY	0x400
> @@ -321,6 +324,7 @@ extern unsigned long MODULES_END;
>  
>  /* Bits in the region table entry */
>  #define _REGION_ENTRY_ORIGIN	~0xfffUL/* region/segment table origin	    */
> +#define _REGION_ENTRY_RO	0x200	/* region protection bit	    */
>  #define _REGION_ENTRY_INV	0x20	/* invalid region table entry	    */
>  #define _REGION_ENTRY_TYPE_MASK	0x0c	/* region/segment table type mask   */
>  #define _REGION_ENTRY_TYPE_R1	0x0c	/* region first table type	    */
> @@ -382,9 +386,10 @@ extern unsigned long MODULES_END;
>   */
>  #define PAGE_NONE	__pgprot(_PAGE_TYPE_NONE)
>  #define PAGE_RO		__pgprot(_PAGE_TYPE_RO)
> -#define PAGE_RW		__pgprot(_PAGE_TYPE_RW)
> +#define PAGE_RW		__pgprot(_PAGE_TYPE_RO | _PAGE_SWW)
> +#define PAGE_RWC	__pgprot(_PAGE_TYPE_RW | _PAGE_SWW | _PAGE_SWC)
>  
> -#define PAGE_KERNEL	PAGE_RW
> +#define PAGE_KERNEL	PAGE_RWC
>  #define PAGE_COPY	PAGE_RO
>  
>  /*

And this combination of page bits looks consistent. The details are
heavily buried in the arch code so I hope however deals with this area
in the future spots the changelog. Of course, that guy is likely to be
you anyway :)

> @@ -631,23 +636,23 @@ static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste)
>  	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
>  	/* Clear page changed & referenced bit in the storage key */
>  	if (bits & _PAGE_CHANGED)
> -		page_set_storage_key(address, skey ^ bits, 1);
> +		page_set_storage_key(address, skey ^ bits, 0);
>  	else if (bits)
>  		page_reset_referenced(address);
>  	/* Transfer page changed & referenced bit to guest bits in pgste */
>  	pgste_val(pgste) |= bits << 48;		/* RCP_GR_BIT & RCP_GC_BIT */
>  	/* Get host changed & referenced bits from pgste */
>  	bits |= (pgste_val(pgste) & (RCP_HR_BIT | RCP_HC_BIT)) >> 52;
> -	/* Clear host bits in pgste. */
> +	/* Transfer page changed & referenced bit to kvm user bits */
> +	pgste_val(pgste) |= bits << 45;		/* KVM_UR_BIT & KVM_UC_BIT */
> +	/* Clear relevant host bits in pgste. */
>  	pgste_val(pgste) &= ~(RCP_HR_BIT | RCP_HC_BIT);
>  	pgste_val(pgste) &= ~(RCP_ACC_BITS | RCP_FP_BIT);
>  	/* Copy page access key and fetch protection bit to pgste */
>  	pgste_val(pgste) |=
>  		(unsigned long) (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
> -	/* Transfer changed and referenced to kvm user bits */
> -	pgste_val(pgste) |= bits << 45;		/* KVM_UR_BIT & KVM_UC_BIT */
> -	/* Transfer changed & referenced to pte sofware bits */
> -	pte_val(*ptep) |= bits << 1;		/* _PAGE_SWR & _PAGE_SWC */
> +	/* Transfer referenced bit to pte */
> +	pte_val(*ptep) |= (bits & _PAGE_REFERENCED) << 1;
>  #endif
>  	return pgste;
>  
> @@ -660,20 +665,25 @@ static inline pgste_t pgste_update_young(pte_t *ptep, pgste_t pgste)
>  
>  	if (!pte_present(*ptep))
>  		return pgste;
> +	/* Get referenced bit from storage key */
>  	young = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
> -	/* Transfer page referenced bit to pte software bit (host view) */
> -	if (young || (pgste_val(pgste) & RCP_HR_BIT))
> +	if (young)
> +		pgste_val(pgste) |= RCP_GR_BIT;
> +	/* Get host referenced bit from pgste */
> +	if (pgste_val(pgste) & RCP_HR_BIT) {
> +		pgste_val(pgste) &= ~RCP_HR_BIT;
> +		young = 1;
> +	}
> +	/* Transfer referenced bit to kvm user bits and pte */
> +	if (young) {
> +		pgste_val(pgste) |= KVM_UR_BIT;
>  		pte_val(*ptep) |= _PAGE_SWR;
> -	/* Clear host referenced bit in pgste. */
> -	pgste_val(pgste) &= ~RCP_HR_BIT;
> -	/* Transfer page referenced bit to guest bit in pgste */
> -	pgste_val(pgste) |= (unsigned long) young << 50; /* set RCP_GR_BIT */
> +	}
>  #endif
>  	return pgste;
> -
>  }
>  
> -static inline void pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
> +static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry)
>  {
>  #ifdef CONFIG_PGSTE
>  	unsigned long address;
> @@ -687,10 +697,23 @@ static inline void pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
>  	/* Set page access key and fetch protection bit from pgste */
>  	nkey |= (pgste_val(pgste) & (RCP_ACC_BITS | RCP_FP_BIT)) >> 56;
>  	if (okey != nkey)
> -		page_set_storage_key(address, nkey, 1);
> +		page_set_storage_key(address, nkey, 0);
>  #endif
>  }
>  
> +static inline void pgste_set_pte(pte_t *ptep, pte_t entry)
> +{
> +	if (!MACHINE_HAS_ESOP && (pte_val(entry) & _PAGE_SWW)) {
> +		/*
> +		 * Without enhanced suppression-on-protection force
> +		 * the dirty bit on for all writable ptes.
> +		 */
> +		pte_val(entry) |= _PAGE_SWC;
> +		pte_val(entry) &= ~_PAGE_RO;
> +	}
> +	*ptep = entry;
> +}
> +
>  /**
>   * struct gmap_struct - guest address space
>   * @mm: pointer to the parent mm_struct

So establishing a writable PTE clears the hardware RO override as
well and the changed bit is set so it'll be considered dirty.

Reading down through the other page tables updates, I couldn't spot a place
where you were inconsistent with the handling of the hardware _PAGE_CO
and _PAGE_RO. Writes were allowed when the change bit was already set.
I couldn't follow it all, particularly around the KVM bits so take it with
a grain of salt but for me anyway;

Acked-by: Mel Gorman <mgorman@xxxxxxx>

-- 
Mel Gorman
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>