Re: [RFC PATCH 3/3] ovl: implement stacked mmap for shared map

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 8/31/20 11:51 PM, Amir Goldstein wrote:
On Mon, Aug 31, 2020 at 4:47 PM cgxu <cgxu519@xxxxxxxxxxxx> wrote:

On 8/30/20 7:33 PM, Amir Goldstein wrote:
On Sat, Aug 29, 2020 at 12:51 PM Chengguang Xu <cgxu519@xxxxxxxxxxxx> wrote:

Implement stacked mmap for shared map to keep data
consistency.

Signed-off-by: Chengguang Xu <cgxu519@xxxxxxxxxxxx>
---
   fs/overlayfs/file.c | 120 +++++++++++++++++++++++++++++++++++++++++---
   1 file changed, 114 insertions(+), 6 deletions(-)

diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 14ab5344a918..db5ab200d984 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -21,9 +21,17 @@ struct ovl_aio_req {
          struct fd fd;
   };

+static vm_fault_t ovl_fault(struct vm_fault *vmf);
+static vm_fault_t ovl_page_mkwrite(struct vm_fault *vmf);
+
+static const struct vm_operations_struct ovl_vm_ops = {
+       .fault          = ovl_fault,
+       .page_mkwrite   = ovl_page_mkwrite,
+};
+

Interesting direction, not sure if this is workable.
I don't know enough about mm to say.

But what about the rest of the operations?
Did you go over them and decide that overlay doesn't need to implement them?
I doubt it, but if you did, please document that.

I did some check for rest of them, IIUC ->fault will be enough for this
special case (shared read-only mmap with no upper), I will remove
->page_mkwrite in v2.

Ok I suppose you checked that ->map_pages is not relevant?


->map_pages() does easy maps and fallback to ->fault if the offset is not ready, so I think without ->map_pages() it still could work properly, we can also implement it for acceleration.




# I do not consider support ->huge_fault in current stage due to many fs
cannot support DAX properly.

BTW, do you know who should I add to CC list for further deep review of
this code? fadevel-list?


fsdevel would be good, but I would wait for initial feedback from Miklos
before you post v2...




   struct ovl_file_entry {
          struct file *realfile;
-       void *vm_ops;
+       const struct vm_operations_struct *vm_ops;
   };

   struct file *ovl_get_realfile(struct file *file)
@@ -40,14 +48,15 @@ void ovl_set_realfile(struct file *file, struct file *realfile)
          ofe->realfile = realfile;
   }

-void *ovl_get_real_vmops(struct file *file)
+const struct vm_operations_struct *ovl_get_real_vmops(struct file *file)
   {
          struct ovl_file_entry *ofe = file->private_data;

          return ofe->vm_ops;
   }

-void ovl_set_real_vmops(struct file *file, void *vm_ops)
+void ovl_set_real_vmops(struct file *file,
+                       const struct vm_operations_struct *vm_ops)
   {
          struct ovl_file_entry *ofe = file->private_data;

@@ -493,11 +502,104 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
          return ret;
   }

+vm_fault_t ovl_fault(struct vm_fault *vmf)
+{
+       struct vm_area_struct *vma = vmf->vma;
+       struct file *file = vma->vm_file;
+       struct file *realfile;
+       struct file *fpin, *tmp;
+       struct inode *inode = file_inode(file);
+       struct inode *realinode;
+       const struct cred *old_cred;
+       bool retry_allowed;
+       vm_fault_t ret;
+       int err = 0;
+
+       if (fault_flag_check(vmf, FAULT_FLAG_TRIED)) {
+               realfile = ovl_get_realfile(file);
+
+               if (!ovl_has_upperdata(inode) ||
+                   realfile->f_inode != ovl_inode_upper(inode) ||
+                   !realfile->f_op->mmap)
+                       return VM_FAULT_SIGBUS;
+
+               if (!ovl_get_real_vmops(file)) {
+                       old_cred = ovl_override_creds(inode->i_sb);
+                       err = call_mmap(realfile, vma);
+                       revert_creds(old_cred);
+
+                       vma->vm_file = file;
+                       if (err) {
+                               vma->vm_ops = &ovl_vm_ops;
+                               return VM_FAULT_SIGBUS;
+                       }
+                       ovl_set_real_vmops(file, vma->vm_ops);
+                       vma->vm_ops = &ovl_vm_ops;
+               }
+
+               retry_allowed = fault_flag_check(vmf, FAULT_FLAG_ALLOW_RETRY);
+               if (retry_allowed)
+                       vma->vm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
+               vma->vm_file = realfile;
+               ret = ovl_get_real_vmops(file)->fault(vmf);
+               vma->vm_file = file;
+               if (retry_allowed)
+                       vma->vm_flags |= FAULT_FLAG_ALLOW_RETRY;
+               return ret;
+
+       } else {
+               fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+               if (!fpin)
+                       return VM_FAULT_SIGBUS;
+
+               ret = VM_FAULT_RETRY;
+               if (!ovl_has_upperdata(inode)) {
+                       err = ovl_copy_up_with_data(file->f_path.dentry);
+                       if (err)
+                               goto out;
+               }
+
+               realinode = ovl_inode_realdata(inode);
+               realfile = ovl_open_realfile(file, realinode);
+               if (IS_ERR(realfile))
+                       goto out;
+
+               tmp = ovl_get_realfile(file);
+               ovl_set_realfile(file, realfile);
+               fput(tmp);
+
+out:
+               fput(fpin);
+               return ret;
+       }
+}


Please add some documentation to explain the method used.
Do we need to retry if real_vmops are already set?


Good catch, actually retry is not needed in that case.

Basically, we unlock(mmap_lock)->copy-up->open when
detecting no upper inode then retry fault operation.
However, we need to check fault retry flag carefully
for avoiding endless retry.

That much I got, but the details of setting ->vm_file and vmops
look subtle, so better explain them.


I'll add some explanations in V2, but before that let me write some
comments based on code logic below. If there is still something not clear you can point out that again.



+	if (fault_flag_check(vmf, FAULT_FLAG_TRIED)) {
+		realfile = ovl_get_realfile(file);
+
+		if (!ovl_has_upperdata(inode) ||
+		    realfile->f_inode != ovl_inode_upper(inode) ||
+		    !realfile->f_op->mmap)
+			return VM_FAULT_SIGBUS;

Above condition indicates (copy-up)/(open real-file) failed or
(real-file does not support mmap), so we have to return SIGBUS.



+
+		if (!ovl_get_real_vmops(file)) {
+			old_cred = ovl_override_creds(inode->i_sb);
+			err = call_mmap(realfile, vma);
+			revert_creds(old_cred);
+
+			vma->vm_file = file;
+			if (err) {
+				vma->vm_ops = &ovl_vm_ops;
+				return VM_FAULT_SIGBUS;
+			}
+			ovl_set_real_vmops(file, vma->vm_ops);
+			vma->vm_ops = &ovl_vm_ops;


call_mmap() will rewrite vma->vm_file and vma->vm_ops to upper layer's,
so here recover to overlay's in order to jump into this ovl_fault()
in other page-faults.


+		}
+
+		retry_allowed = fault_flag_check(vmf, FAULT_FLAG_ALLOW_RETRY);
+		if (retry_allowed)
+			vma->vm_flags &= ~FAULT_FLAG_ALLOW_RETRY;

here, we disallow retry in real ->fault because retry will unlock mmap_lock, touching vma after unlock is not safe behavior.


+		vma->vm_file = realfile;
+		ret = ovl_get_real_vmops(file)->fault(vmf);


calling real fault handler.


+		vma->vm_file = file;
+		if (retry_allowed)
+			vma->vm_flags |= FAULT_FLAG_ALLOW_RETRY;


recover vm_file and vm_ops to overlay's so that we can jump into
ovl_fault in other page-faults.


+		return ret;
+
+	} else {



Thanks,
cgxu






[Index of Archives]     [Linux Filesystems Devel]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux