On Mon, 2020-01-13 at 22:56 +0800, Yan, Zheng wrote: > On 1/13/20 9:26 PM, Jeff Layton wrote: > > On Mon, 2020-01-13 at 11:51 +0800, Yan, Zheng wrote: > > > On 1/11/20 4:56 AM, Jeff Layton wrote: > > > > It doesn't do much good to do an asynchronous create unless we can do > > > > I/O to it before the create reply comes in. That means we need layout > > > > info the new file before we've gotten the response from the MDS. > > > > > > > > All files created in a directory will initially inherit the same layout, > > > > so copy off the requisite info from the first synchronous create in the > > > > directory. Save it in the same fields in the directory inode, as those > > > > are otherwise unsed for dir inodes. This means we need to be a bit > > > > careful about only updating layout info on non-dir inodes. > > > > > > > > Also, zero out the layout when we drop Dc caps in the dir. > > > > > > > > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx> > > > > --- > > > > fs/ceph/caps.c | 24 ++++++++++++++++++++---- > > > > fs/ceph/file.c | 24 +++++++++++++++++++++++- > > > > fs/ceph/inode.c | 4 ++-- > > > > 3 files changed, 45 insertions(+), 7 deletions(-) > > > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > > > > index 7fc87b693ba4..b96fb1378479 100644 > > > > --- a/fs/ceph/caps.c > > > > +++ b/fs/ceph/caps.c > > > > @@ -2847,7 +2847,7 @@ int ceph_get_caps(struct file *filp, int need, int want, > > > > return ret; > > > > } > > > > > > > > - if (S_ISREG(ci->vfs_inode.i_mode) && > > > > + if (!S_ISDIR(ci->vfs_inode.i_mode) && > > > > ci->i_inline_version != CEPH_INLINE_NONE && > > > > (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && > > > > i_size_read(inode) > 0) { > > > > @@ -2944,9 +2944,17 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) > > > > if (had & CEPH_CAP_FILE_RD) > > > > if (--ci->i_rd_ref == 0) > > > > last++; > > > > - if (had & CEPH_CAP_FILE_CACHE) > > > > - if (--ci->i_rdcache_ref == 0) > > > > + if (had & CEPH_CAP_FILE_CACHE) { > > > > + if (--ci->i_rdcache_ref == 0) { > > > > last++; > > > > + /* Zero out layout if we lost CREATE caps */ > > > > + if (S_ISDIR(inode->i_mode) && > > > > + !(__ceph_caps_issued(ci, NULL) & CEPH_CAP_DIR_CREATE)) { > > > > + ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); > > > > + memset(&ci->i_layout, 0, sizeof(ci->i_layout)); > > > > + } > > > > + } > > > > + } > > > > > > should do this in __check_cap_issue > > > > > > > That function doesn't get called from the put codepath. Suppose one task > > is setting up an async create and a Dc (DIR_CREATE) cap revoke races in. > > We zero out the layout and then the inode has a bogus layout set in it. > > > > We can't wipe the cached layout until all of the Dc references are put. > > how about: > > get_caps_for_async_create() return the layout. > pass the returned layout into ceph_finish_async_open() > That still sounds racy. What guarantees the stability of the cached layout while we're copying it in get_caps_for_async_create()? I guess we could protect the new cached layout field with the i_ceph_lock. Is that a real improvement? I'm not sure. > > > > > > if (had & CEPH_CAP_FILE_EXCL) > > > > if (--ci->i_fx_ref == 0) > > > > last++; > > > > @@ -3264,7 +3272,8 @@ static void handle_cap_grant(struct inode *inode, > > > > ci->i_subdirs = extra_info->nsubdirs; > > > > } > > > > > > > > - if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { > > > > + if (!S_ISDIR(inode->i_mode) && > > > > + (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { > > > > /* file layout may have changed */ > > > > s64 old_pool = ci->i_layout.pool_id; > > > > struct ceph_string *old_ns; > > > > @@ -3336,6 +3345,13 @@ static void handle_cap_grant(struct inode *inode, > > > > ceph_cap_string(cap->issued), > > > > ceph_cap_string(newcaps), > > > > ceph_cap_string(revoking)); > > > > + > > > > + if (S_ISDIR(inode->i_mode) && > > > > + (revoking & CEPH_CAP_DIR_CREATE) && !ci->i_rdcache_ref) { > > > > + ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); > > > > + memset(&ci->i_layout, 0, sizeof(ci->i_layout)); > > > > + } > > > > > > same here > > > > > > > + > > > > if (S_ISREG(inode->i_mode) && > > > > (revoking & used & CEPH_CAP_FILE_BUFFER)) > > > > writeback = true; /* initiate writeback; will delay ack */ > > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > > > > index 1e6cdf2dfe90..d4d7a277faf1 100644 > > > > --- a/fs/ceph/file.c > > > > +++ b/fs/ceph/file.c > > > > @@ -430,6 +430,25 @@ int ceph_open(struct inode *inode, struct file *file) > > > > return err; > > > > } > > > > > > > > +/* Clone the layout from a synchronous create, if the dir now has Dc caps */ > > > > +static void > > > > +copy_file_layout(struct inode *dst, struct inode *src) > > > > +{ > > > > + struct ceph_inode_info *cdst = ceph_inode(dst); > > > > + struct ceph_inode_info *csrc = ceph_inode(src); > > > > + > > > > + spin_lock(&cdst->i_ceph_lock); > > > > + if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) && > > > > + !ceph_file_layout_is_valid(&cdst->i_layout)) { > > > > + memcpy(&cdst->i_layout, &csrc->i_layout, > > > > + sizeof(cdst->i_layout)); > > > > + rcu_assign_pointer(cdst->i_layout.pool_ns, > > > > + ceph_try_get_string(csrc->i_layout.pool_ns)); > > > > + cdst->i_max_size = csrc->i_max_size; > > > > + cdst->i_truncate_size = csrc->i_truncate_size; > > > > + } > > > > + spin_unlock(&cdst->i_ceph_lock); > > > > +} > > > > > > > > /* > > > > * Do a lookup + open with a single request. If we get a non-existent > > > > @@ -518,7 +537,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > > > > } else { > > > > dout("atomic_open finish_open on dn %p\n", dn); > > > > if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { > > > > - ceph_init_inode_acls(d_inode(dentry), &as_ctx); > > > > + struct inode *newino = d_inode(dentry); > > > > + > > > > + copy_file_layout(dir, newino); > > > > + ceph_init_inode_acls(newino, &as_ctx); > > > > file->f_mode |= FMODE_CREATED; > > > > } > > > > err = finish_open(file, dentry, ceph_open); > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c > > > > index 9cfc093fd273..8b51051b79b0 100644 > > > > --- a/fs/ceph/inode.c > > > > +++ b/fs/ceph/inode.c > > > > @@ -848,8 +848,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, > > > > ci->i_subdirs = le64_to_cpu(info->subdirs); > > > > } > > > > > > > > - if (new_version || > > > > - (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { > > > > + if (!S_ISDIR(inode->i_mode) && (new_version || > > > > + (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)))) { > > > > s64 old_pool = ci->i_layout.pool_id; > > > > struct ceph_string *old_ns; > > > > > > > > -- Jeff Layton <jlayton@xxxxxxxxxx>