This patch adds basic page table sharing across tasks through the
mshare syscall. It does this by creating a new mm_struct which
hosts the shared vmas and page tables. This mm_struct is
maintained as long as there is at least one task using the mshare'd
range. It is cleaned up by the last mshare_unlink syscall.
Signed-off-by: Khalid Aziz <khalid.aziz@xxxxxxxxxx>
Signed-off-by: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx>
---
mm/internal.h | 2 +
mm/memory.c | 35 ++++++++++
mm/mshare.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++----
3 files changed, 214 insertions(+), 13 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index cf50a471384e..68f82f0f8b66 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -718,6 +718,8 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags);
+/*
+ * Defined in mm/mshare.c. Called from handle_mm_fault() for vmas that
+ * have VM_SHARED_PT set. A non-zero return is a vm_fault_t that the
+ * caller returns as-is. On a 0 return, *vma and *addrp have been
+ * replaced with the vma (possibly NULL) and address looked up in the mm
+ * that hosts the shared range; when *vma is non-NULL the host mm's mmap
+ * lock is held for read and the caller must release it.
+ */
+extern vm_fault_t find_shared_vma(struct vm_area_struct **vma,
+ unsigned long *addrp);
static inline bool vma_is_shared(const struct vm_area_struct *vma)
{
return vma->vm_flags & VM_SHARED_PT;
diff --git a/mm/memory.c b/mm/memory.c
index c125c4969913..c77c0d643ea8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4776,6 +4776,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;
+ bool shared = false;
__set_current_state(TASK_RUNNING);
@@ -4785,6 +4786,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
+ if (unlikely(vma_is_shared(vma))) {
+ ret = find_shared_vma(&vma, &address);
+ if (ret)
+ return ret;
+ if (!vma)
+ return VM_FAULT_SIGSEGV;
+ shared = true;
+ }
+
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
@@ -4802,6 +4812,31 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
else
ret = __handle_mm_fault(vma, address, flags);
+ /*
+ * Release the read lock on shared VMA's parent mm unless
+ * __handle_mm_fault released the lock already.
+ * __handle_mm_fault sets VM_FAULT_RETRY in return value if
+ * it released mmap lock. If lock was released, that implies
+ * the lock would have been released on task's original mm if
+ * this were not a shared PTE vma. To keep lock state consistent,
+ * make sure to release the lock on task's original mm
+ */
+ if (shared) {
+ int release_mmlock = 1;
+
+ if (!(ret & VM_FAULT_RETRY)) {
+ mmap_read_unlock(vma->vm_mm);
+ release_mmlock = 0;
+ } else if ((flags & FAULT_FLAG_ALLOW_RETRY) &&
+ (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ mmap_read_unlock(vma->vm_mm);
+ release_mmlock = 0;
+ }
+
+ if (release_mmlock)
+ mmap_read_unlock(current->mm);
+ }
+
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
diff --git a/mm/mshare.c b/mm/mshare.c
index cd2f7ad24d9d..d1896adcb00f 100644
--- a/mm/mshare.c
+++ b/mm/mshare.c
@@ -17,18 +17,49 @@
#include <linux/pseudo_fs.h>
#include <linux/fileattr.h>
#include <linux/refcount.h>
+#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <uapi/linux/magic.h>
#include <uapi/linux/limits.h>
struct mshare_data {
- struct mm_struct *mm;
+ struct mm_struct *mm, *host_mm;
mode_t mode;
refcount_t refcnt;
};
static struct super_block *msharefs_sb;
+/*
+ * find_shared_vma - redirect a fault on an mshare'd vma to the host mm
+ * @vmap:  in: the faulting task's (guest) vma; out: the vma covering the
+ *         translated address in the host mm, or NULL if there is none
+ * @addrp: in: faulting address in the guest; out: the corresponding
+ *         address in the host mm (only updated when 0 is returned)
+ *
+ * If the guest's pgd entry for the faulting address differs from the
+ * host's, the host entry is copied into the guest and VM_FAULT_NOPAGE is
+ * returned. No lock is taken on that path and *vmap / *addrp are left
+ * untouched.
+ *
+ * Otherwise returns 0. When *vmap is non-NULL on return, the host mm's
+ * mmap lock is held for read and the caller must release it. When no
+ * host vma covers the address, *vmap is set to NULL and the lock has
+ * already been dropped here, so the caller can fail the fault without
+ * needing a handle on the host mm to unlock it.
+ */
+vm_fault_t
+find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp)
+{
+	struct vm_area_struct *vma, *guest = *vmap;
+	struct mshare_data *info = guest->vm_private_data;
+	struct mm_struct *host_mm = info->mm;
+	unsigned long host_addr;
+	pgd_t *pgd, *guest_pgd;
+
+	/* Offset within the guest vma, rebased onto the host mm's mmap_base. */
+	host_addr = *addrp - guest->vm_start + host_mm->mmap_base;
+	pgd = pgd_offset(host_mm, host_addr);
+	guest_pgd = pgd_offset(current->mm, *addrp);
+	if (!pgd_same(*guest_pgd, *pgd)) {
+		set_pgd(guest_pgd, *pgd);
+		return VM_FAULT_NOPAGE;
+	}
+
+	*addrp = host_addr;
+	mmap_read_lock(host_mm);
+	vma = find_vma(host_mm, host_addr);
+
+	/* XXX: expand stack? */
+	if (vma && vma->vm_start > host_addr)
+		vma = NULL;
+
+	/*
+	 * The caller only sees a NULL vma on this path and therefore cannot
+	 * reach host_mm to unlock it; release the lock here instead of
+	 * leaking it.
+	 */
+	if (!vma)
+		mmap_read_unlock(host_mm);
+
+	*vmap = vma;
+	return 0;
+}
+
static void
msharefs_evict_inode(struct inode *inode)
{
@@ -169,11 +200,13 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
unsigned long, len, int, oflag, mode_t, mode)
{
struct mshare_data *info;
- struct mm_struct *mm;
struct filename *fname = getname(name);
struct dentry *dentry;
struct inode *inode;
struct qstr namestr;
+ struct vm_area_struct *vma, *next, *new_vma;
+ struct mm_struct *new_mm;
+ unsigned long end;
int err = PTR_ERR(fname);
/*
@@ -193,6 +226,8 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
if (IS_ERR(fname))
goto err_out;
+ end = addr + len;
+
/*
* Does this mshare entry exist already? If it does, calling
* mshare with O_EXCL|O_CREAT is an error
@@ -205,49 +240,165 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
inode_lock(d_inode(msharefs_sb->s_root));
dentry = d_lookup(msharefs_sb->s_root, &namestr);
if (dentry && (oflag & (O_EXCL|O_CREAT))) {
+ inode = d_inode(dentry);
err = -EEXIST;
dput(dentry);
goto err_unlock_inode;
}
if (dentry) {
+ unsigned long mapaddr, prot = PROT_NONE;
+
inode = d_inode(dentry);
if (inode == NULL) {
+ mmap_write_unlock(current->mm);
err = -EINVAL;
goto err_out;
}
info = inode->i_private;
- refcount_inc(&info->refcnt);
dput(dentry);
+
+ /*
+ * Map in the address range as anonymous mappings
+ */
+ oflag &= (O_RDONLY | O_WRONLY | O_RDWR);
+ if (oflag & O_RDONLY)
+ prot |= PROT_READ;
+ else if (oflag & O_WRONLY)
+ prot |= PROT_WRITE;
+ else if (oflag & O_RDWR)
+ prot |= (PROT_READ | PROT_WRITE);
+ mapaddr = vm_mmap(NULL, addr, len, prot,
+ MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, 0);
+ if (IS_ERR((void *)mapaddr)) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ refcount_inc(&info->refcnt);
+
+ /*
+ * Now that we have mmap'd the mshare'd range, update vma
+ * flags and vm_mm pointer for this mshare'd range.
+ */
+ mmap_write_lock(current->mm);
+ vma = find_vma(current->mm, addr);
+ if (vma && vma->vm_start < addr) {