On 8/30/20 7:33 PM, Amir Goldstein wrote:
On Sat, Aug 29, 2020 at 12:51 PM Chengguang Xu <cgxu519@xxxxxxxxxxxx> wrote:
Implement stacked mmap for shared map to keep data
consistency.
Signed-off-by: Chengguang Xu <cgxu519@xxxxxxxxxxxx>
---
fs/overlayfs/file.c | 120 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 114 insertions(+), 6 deletions(-)
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 14ab5344a918..db5ab200d984 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -21,9 +21,17 @@ struct ovl_aio_req {
struct fd fd;
};
+static vm_fault_t ovl_fault(struct vm_fault *vmf);
+static vm_fault_t ovl_page_mkwrite(struct vm_fault *vmf);
+
+/*
+ * vm_ops installed on overlay shared mappings.  The real file's vm_ops
+ * are stashed per-file (see ovl_set_real_vmops()) and the handlers
+ * below dispatch to them.
+ */
+static const struct vm_operations_struct ovl_vm_ops = {
+ .fault = ovl_fault,
+ .page_mkwrite = ovl_page_mkwrite,
+};
+
Interesting direction, not sure if this is workable.
I don't know enough about mm to say.
But what about the rest of the operations?
Did you go over them and decide that overlay doesn't need to implement them?
I doubt it, but if you did, please document that.
I did some checking on the rest of them; IIUC, ->fault will be enough for this
special case (shared read-only mmap with no upper), so I will remove
->page_mkwrite in v2.
# I do not plan to support ->huge_fault at the current stage, because many
filesystems cannot support DAX properly.
BTW, do you know whom I should add to the CC list for further in-depth review
of this code? The linux-fsdevel list?
struct ovl_file_entry {
struct file *realfile;
- void *vm_ops;
+ /* vm_ops of the real file, saved after the first call_mmap(); may be NULL */
+ const struct vm_operations_struct *vm_ops;
};
struct file *ovl_get_realfile(struct file *file)
@@ -40,14 +48,15 @@ void ovl_set_realfile(struct file *file, struct file *realfile)
ofe->realfile = realfile;
}
+/* Return the real file's vm_ops saved by ovl_set_real_vmops(), or NULL. */
-void *ovl_get_real_vmops(struct file *file)
+const struct vm_operations_struct *ovl_get_real_vmops(struct file *file)
{
struct ovl_file_entry *ofe = file->private_data;
return ofe->vm_ops;
}
-void ovl_set_real_vmops(struct file *file, void *vm_ops)
+void ovl_set_real_vmops(struct file *file,
+ const struct vm_operations_struct *vm_ops)
{
struct ovl_file_entry *ofe = file->private_data;
@@ -493,11 +502,104 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return ret;
}
+/*
+ * Stacked fault handler for overlay shared mappings.
+ *
+ * First pass (FAULT_FLAG_TRIED not set): drop mmap_lock via
+ * maybe_unlock_mmap_for_io(), copy up the file if it has no up-to-date
+ * upper data, (re)open the real file, and return VM_FAULT_RETRY so the
+ * fault is restarted.
+ *
+ * Retried pass (FAULT_FLAG_TRIED set): lazily call_mmap() the real file
+ * to obtain its vm_ops, then forward the fault to the real ->fault with
+ * vm_file temporarily switched to the real file.  Retry is suppressed
+ * around the nested fault to avoid an endless retry loop.
+ */
+vm_fault_t ovl_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct file *file = vma->vm_file;
+ struct file *realfile;
+ struct file *fpin, *tmp;
+ struct inode *inode = file_inode(file);
+ struct inode *realinode;
+ const struct cred *old_cred;
+ bool retry_allowed;
+ vm_fault_t ret;
+ int err = 0;
+
+ if (fault_flag_check(vmf, FAULT_FLAG_TRIED)) {
+ realfile = ovl_get_realfile(file);
+
+ /* The retried fault must see an up-to-date, mmap-capable upper file. */
+ if (!ovl_has_upperdata(inode) ||
+ realfile->f_inode != ovl_inode_upper(inode) ||
+ !realfile->f_op->mmap)
+ return VM_FAULT_SIGBUS;
+
+ if (!ovl_get_real_vmops(file)) {
+ old_cred = ovl_override_creds(inode->i_sb);
+ err = call_mmap(realfile, vma);
+ revert_creds(old_cred);
+
+ /* call_mmap() pointed vm_file at realfile; restore the overlay file. */
+ vma->vm_file = file;
+ if (err) {
+ vma->vm_ops = &ovl_vm_ops;
+ return VM_FAULT_SIGBUS;
+ }
+ ovl_set_real_vmops(file, vma->vm_ops);
+ vma->vm_ops = &ovl_vm_ops;
+ }
+
+ /*
+ * FAULT_FLAG_* live in vmf->flags, not in vma->vm_flags
+ * (the VM_* namespace) -- masking vm_flags with a fault
+ * flag constant would corrupt unrelated VM_* bits and
+ * would not actually suppress retry.
+ */
+ retry_allowed = fault_flag_check(vmf, FAULT_FLAG_ALLOW_RETRY);
+ if (retry_allowed)
+ vmf->flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ vma->vm_file = realfile;
+ ret = ovl_get_real_vmops(file)->fault(vmf);
+ vma->vm_file = file;
+ if (retry_allowed)
+ vmf->flags |= FAULT_FLAG_ALLOW_RETRY;
+ return ret;
+
+ } else {
+ /* Release mmap_lock so copy-up and open can sleep safely. */
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+ if (!fpin)
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * On any failure below we still return VM_FAULT_RETRY;
+ * the retried (TRIED) pass converges to SIGBUS via the
+ * upperdata check above.
+ */
+ ret = VM_FAULT_RETRY;
+ if (!ovl_has_upperdata(inode)) {
+ err = ovl_copy_up_with_data(file->f_path.dentry);
+ if (err)
+ goto out;
+ }
+
+ realinode = ovl_inode_realdata(inode);
+ realfile = ovl_open_realfile(file, realinode);
+ if (IS_ERR(realfile))
+ goto out;
+
+ /* Swap in the freshly opened real file, dropping the old one. */
+ tmp = ovl_get_realfile(file);
+ ovl_set_realfile(file, realfile);
+ fput(tmp);
+
+out:
+ fput(fpin);
+ return ret;
+ }
+}
Please add some documentation to explain the method used.
Do we need to retry if real_vmops are already set?
Good catch, actually retry is not needed in that case.
Basically, when we detect that there is no upper inode, we unlock mmap_lock,
do the copy-up and open the real file, and then retry the fault operation.
However, we need to check the fault retry flag carefully to avoid an
endless retry loop.
I'll add more explanation in v2.
---
cgxu