Add ZONE_NVM to enum zone_type, and create GFP_NVM, which represents the gfp_t flag for the NVM zone. Because no lower plain-integer GFP bit is free for ___GFP_NVM, a workable approach is to take the (DMA+HIGHMEM) combination, previously marked invalid in GFP_ZONE_BAD, and use its slot to place ZONE_NVM in GFP_ZONE_TABLE. Signed-off-by: Huaisheng Ye <yehs1@xxxxxxxxxx> Signed-off-by: Ocean He <hehy1@xxxxxxxxxx> --- include/linux/gfp.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/mmzone.h | 3 +++ mm/Kconfig | 16 ++++++++++++++ mm/page_alloc.c | 3 +++ 4 files changed, 76 insertions(+), 3 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 1a4582b..9e4d867 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,6 +39,9 @@ #define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_WRITE 0x800000u #define ___GFP_KSWAPD_RECLAIM 0x1000000u +#ifdef CONFIG_ZONE_NVM +#define ___GFP_NVM 0x4000000u +#endif #ifdef CONFIG_LOCKDEP #define ___GFP_NOLOCKDEP 0x2000000u #else @@ -57,7 +60,12 @@ #define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM) #define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32) #define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */ +#ifdef CONFIG_ZONE_NVM +#define __GFP_NVM ((__force gfp_t)___GFP_NVM) /* ZONE_NVM allowed */ +#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE|__GFP_NVM) +#else #define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) +#endif /* * Page mobility and placement hints @@ -205,7 +213,8 @@ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP) + \ + (IS_ENABLED(CONFIG_ZONE_NVM) << 1)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* @@ -283,6 +292,9 @@ #define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM) #define GFP_TRANSHUGE 
(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM) +#ifdef CONFIG_ZONE_NVM +#define GFP_NVM __GFP_NVM +#endif /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) @@ -342,7 +354,7 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) * 0x0 => NORMAL * 0x1 => DMA or NORMAL * 0x2 => HIGHMEM or NORMAL - * 0x3 => BAD (DMA+HIGHMEM) + * 0x3 => NVM (DMA+HIGHMEM), now it is used by NVDIMM zone * 0x4 => DMA32 or DMA or NORMAL * 0x5 => BAD (DMA+DMA32) * 0x6 => BAD (HIGHMEM+DMA32) @@ -370,6 +382,29 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) #error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer #endif +#ifdef CONFIG_ZONE_NVM +#define ___GFP_NVM_BIT (___GFP_DMA | ___GFP_HIGHMEM) +#define GFP_ZONE_TABLE ( \ + ((__force unsigned long)ZONE_NORMAL << \ + 0 * GFP_ZONES_SHIFT) \ + | ((__force unsigned long)OPT_ZONE_DMA << \ + ___GFP_DMA * GFP_ZONES_SHIFT) \ + | ((__force unsigned long)OPT_ZONE_HIGHMEM << \ + ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \ + | ((__force unsigned long)OPT_ZONE_DMA32 << \ + ___GFP_DMA32 * GFP_ZONES_SHIFT) \ + | ((__force unsigned long)ZONE_NORMAL << \ + ___GFP_MOVABLE * GFP_ZONES_SHIFT) \ + | ((__force unsigned long)OPT_ZONE_DMA << \ + (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \ + | ((__force unsigned long)ZONE_MOVABLE << \ + (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT) \ + | ((__force unsigned long)OPT_ZONE_DMA32 << \ + (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT) \ + | ((__force unsigned long)ZONE_NVM << \ + ___GFP_NVM_BIT * GFP_ZONES_SHIFT) \ +) +#else #define GFP_ZONE_TABLE ( \ (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \ @@ -380,6 +415,7 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\ | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\ ) +#endif /* * GFP_ZONE_BAD is a 
bitmap for all combinations of __GFP_DMA, __GFP_DMA32 @@ -387,6 +423,17 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) * entry starting with bit 0. Bit is set if the combination is not * allowed. */ +#ifdef CONFIG_ZONE_NVM +#define GFP_ZONE_BAD ( \ + 1 << (___GFP_DMA | ___GFP_DMA32) \ + | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \ + | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \ + | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \ + | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \ + | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \ + | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ +) +#else #define GFP_ZONE_BAD ( \ 1 << (___GFP_DMA | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32) \ @@ -397,12 +444,16 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ ) +#endif static inline enum zone_type gfp_zone(gfp_t flags) { enum zone_type z; int bit = (__force int) (flags & GFP_ZONEMASK); - +#ifdef CONFIG_ZONE_NVM + if (bit & __GFP_NVM) + bit = (__force int)___GFP_NVM_BIT; +#endif z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7522a69..f38e4a0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -345,6 +345,9 @@ enum zone_type { */ ZONE_HIGHMEM, #endif +#ifdef CONFIG_ZONE_NVM + ZONE_NVM, +#endif ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, diff --git a/mm/Kconfig b/mm/Kconfig index c782e8f..5fe1f63 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -687,6 +687,22 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. 
+config ZONE_NVM + bool "Manage NVDIMM (pmem) by memory management (EXPERIMENTAL)" + depends on NUMA && X86_64 + depends on HAVE_MEMBLOCK_NODE_MAP + depends on HAVE_MEMBLOCK + depends on !IA32_EMULATION + default n + + help + This option allows you to use memory management subsystem to manage + NVDIMM (pmem). With it mm can arrange NVDIMMs into real physical zones + like NORMAL and DMA32. That means buddy system and swap can be used + directly to NVDIMM zone. This feature is beneficial to recover + dirty pages from power fail or system crash by storing write cache + to NVDIMM zone. + config ARCH_HAS_HMM bool default y diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 266c065..d8bd20d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -228,6 +228,9 @@ bool pm_suspended_storage(void) "DMA32", #endif "Normal", +#ifdef CONFIG_ZONE_NVM + "NVM", +#endif #ifdef CONFIG_HIGHMEM "HighMem", #endif -- 1.8.3.1