[PATCH] shmem: support not splitting tmpfs hugepage PMD on COW

Transparent hugepages in tmpfs can improve TLB efficiency by reducing
TLB misses. However, a write fault on a private (copy-on-write)
mapping of such a hugepage currently splits the huge PMD and falls
back to base pages. In some scenarios it is desirable to keep the
mapping PMD-sized. Introduce a shmem_huge_fault vm operation that
performs the copy-on-write at PMD granularity instead of splitting,
together with a tmpfs mount parameter, no_split, to enable or disable
this behaviour.
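
With no_split enabled, for example:

  mount -t tmpfs -o huge=always,no_split=1 tmpfs /mnt/tmpfs

a COW write fault allocates a new huge folio and copies the whole PMD
range, so the mapping stays PMD-mapped (ShmemPmdMapped) instead of
being downgraded to PTE-mapped base pages.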

Signed-off-by: Chen Haixiang <chenhaixiang3@xxxxxxxxxx>
---
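
Not for merging: below is a minimal userspace sketch of the scenario
this patch targets, for reviewers who want to reproduce it. The mount
point and file name are illustrative assumptions; it expects a tmpfs
mounted with huge=always,no_split=1 and a 2 MiB PMD size.

/* cow_sketch.c - read-fault a PMD-sized private mapping of a tmpfs
 * file so that it is mapped by a huge PMD, then write to it.  Without
 * this patch the write splits the PMD; with no_split=1 the range
 * should stay huge (compare ShmemPmdMapped in /proc/meminfo before
 * and after the write).
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define SZ (2UL << 20)	/* one PMD-sized region: 2 MiB on x86-64 */

int main(void)
{
	int fd = open("/mnt/tmpfs/cow_sketch", O_CREAT | O_RDWR, 0600);
	volatile char *p;

	if (fd < 0 || ftruncate(fd, SZ) < 0)
		return 1;

	/* shmem_get_unmapped_area() picks a THP-aligned address */
	p = mmap(NULL, SZ, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	(void)p[0];	/* read fault: maps the huge PMD read-only */
	p[0] = 1;	/* write fault: exercises the wp_huge_pmd() path */

	printf("pid %d: check ShmemPmdMapped, then kill me\n", getpid());
	pause();	/* keep the mapping alive for inspection */
	return 0;
}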
 include/linux/mm.h       |  1 +
 include/linux/shmem_fs.h |  1 +
 mm/memory.c              |  7 ++++
 mm/shmem.c               | 85 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 94 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index da5219b48d52..eb44574965d6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -573,6 +573,7 @@ struct vm_operations_struct {
 			unsigned long end, unsigned long newflags);
 	vm_fault_t (*fault)(struct vm_fault *vmf);
 	vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
+	vm_fault_t (*shmem_huge_fault)(struct vm_fault *vmf, pmd_t orig_pmd);
 	vm_fault_t (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 	unsigned long (*pagesize)(struct vm_area_struct * area);
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 2caa6b86106a..4484f2f33afe 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -73,6 +73,7 @@ struct shmem_sb_info {
 	struct list_head shrinklist;  /* List of shinkable inodes */
 	unsigned long shrinklist_len; /* Length of shrinklist */
 	struct shmem_quota_limits qlimits; /* Default quota limits */
+	unsigned int no_split;  /* Do not split tmpfs huge PMDs on COW */
 };
 
 static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
diff --git a/mm/memory.c b/mm/memory.c
index 5c757fba8858..7d27a6b5e69f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4942,6 +4942,13 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 		}
 	}
 
+	if (vma->vm_ops->shmem_huge_fault) {
+		vm_fault_t ret = vma->vm_ops->shmem_huge_fault(vmf, vmf->orig_pmd);
+
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
+	}
+
 split:
 	/* COW or write-notify handled on pte level: split pmd. */
 	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
diff --git a/mm/shmem.c b/mm/shmem.c
index 0d1ce70bce38..8211211f7405 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -118,6 +118,7 @@ struct shmem_options {
 	umode_t mode;
 	bool full_inums;
 	int huge;
+	unsigned int no_split;
 	int seen;
 	bool noswap;
 	unsigned short quota_types;
@@ -128,6 +129,7 @@ struct shmem_options {
 #define SHMEM_SEEN_INUMS 8
 #define SHMEM_SEEN_NOSWAP 16
 #define SHMEM_SEEN_QUOTA 32
+#define SHMEM_SEEN_NO_SPLIT 64
 };
 
 #ifdef CONFIG_TMPFS
@@ -2238,6 +2240,79 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 	return ret;
 }
 
+static vm_fault_t shmem_huge_fault(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	struct vm_area_struct *vma = vmf->vma;
+	struct inode *inode = file_inode(vma->vm_file);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct folio *old_folio, *new_folio;
+	pmd_t entry;
+
+	if (!sbinfo->no_split)
+		return VM_FAULT_FALLBACK;
+
+	/* Only write faults on private mappings need the huge COW copy. */
+	if (!(vmf->flags & FAULT_FLAG_WRITE) || (vma->vm_flags & VM_SHARED))
+		return VM_FAULT_FALLBACK;
+
+	new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, HPAGE_PMD_ORDER,
+				    vma, haddr, true);
+	if (!new_folio) {
+		count_vm_event(THP_FAULT_FALLBACK);
+		return VM_FAULT_FALLBACK;
+	}
+
+	/* Pin the old folio so it cannot be freed while we copy from it. */
+	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+		spin_unlock(vmf->ptl);
+		folio_put(new_folio);
+		return 0;
+	}
+	old_folio = page_folio(pmd_page(orig_pmd));
+	folio_get(old_folio);
+	spin_unlock(vmf->ptl);
+
+	/* Copy before publishing the new pmd, and without the pmd lock:
+	 * userspace must never see a half-done copy, and the copy can sleep.
+	 */
+	copy_user_large_folio(new_folio, old_folio, haddr, vma);
+
+	__folio_set_locked(new_folio);
+	__folio_set_swapbacked(new_folio);
+	__folio_mark_uptodate(new_folio);
+
+	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+		/* Lost a race while copying: let the fault be retried. */
+		spin_unlock(vmf->ptl);
+		folio_unlock(new_folio);
+		folio_put(new_folio);
+		folio_put(old_folio);
+		return 0;
+	}
+
+	page_remove_rmap(&old_folio->page, vma, true);
+	pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
+
+	flush_icache_pages(vma, &new_folio->page, HPAGE_PMD_NR);
+	entry = mk_huge_pmd(&new_folio->page, vma->vm_page_prot);
+	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+
+	page_add_file_rmap(&new_folio->page, vma, true);
+	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+	update_mmu_cache_pmd(vma, haddr, vmf->pmd);
+	count_vm_event(THP_FILE_MAPPED);
+	spin_unlock(vmf->ptl);
+
+	folio_unlock(new_folio);
+	/* One put for our pin, one for the mapping we just removed. */
+	folio_put(old_folio);
+	folio_put(old_folio);
+	return 0;
+}
+
 unsigned long shmem_get_unmapped_area(struct file *file,
 				      unsigned long uaddr, unsigned long len,
 				      unsigned long pgoff, unsigned long flags)
@@ -3869,6 +3944,7 @@ enum shmem_param {
 	Opt_usrquota_inode_hardlimit,
 	Opt_grpquota_block_hardlimit,
 	Opt_grpquota_inode_hardlimit,
+	Opt_no_split,
 };
 
 static const struct constant_table shmem_param_enums_huge[] = {
@@ -3900,6 +3976,7 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
 	fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
 	fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
 #endif
+	fsparam_u32   ("no_split",	Opt_no_split),
 	{}
 };
 
@@ -4065,6 +4142,10 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
 				       "Group quota inode hardlimit too large.");
 		ctx->qlimits.grpquota_ihardlimit = size;
 		break;
+	case Opt_no_split:
+		ctx->no_split = result.uint_32;
+		ctx->seen |= SHMEM_SEEN_NO_SPLIT;
+		break;
 	}
 	return 0;
 
@@ -4261,6 +4342,8 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
 	if (sbinfo->huge)
 		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
+	if (sbinfo->huge && sbinfo->no_split)
+		seq_printf(seq, ",no_split=%u", sbinfo->no_split);
 #endif
 	mpol = shmem_get_sbmpol(sbinfo);
 	shmem_show_mpol(seq, mpol);
@@ -4315,6 +4398,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
 		if (!(ctx->seen & SHMEM_SEEN_INUMS))
 			ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
 		sbinfo->noswap = ctx->noswap;
+		sbinfo->no_split = ctx->no_split;
 	} else {
 		sb->s_flags |= SB_NOUSER;
 	}
@@ -4568,6 +4652,7 @@ static const struct super_operations shmem_ops = {
 static const struct vm_operations_struct shmem_vm_ops = {
 	.fault		= shmem_fault,
 	.map_pages	= filemap_map_pages,
+	.shmem_huge_fault	= shmem_huge_fault,
 #ifdef CONFIG_NUMA
 	.set_policy     = shmem_set_policy,
 	.get_policy     = shmem_get_policy,
-- 
2.33.0