On Wed, 2019-07-24 at 20:21 +0800, Yan, Zheng wrote: > Make client use osd reply and session message to infer if itself is > blacklisted. Client reconnect to cluster using new entity addr if it > is blacklisted. Auto reconnect is limited to once every 30 minutes. > > Auto reconnect is controlled by recover_session=<clean|no> mount option. > So far only clean mode is supported and it is the default mode. In this > mode, client drops any dirty data/metadata, invalidates page caches and > invalidates all writable file handles. After reconnect, file locks become > stale because MDS lose track of them. If an inode contains any stale file > lock, read/write on the inode are not allowed until all stale file locks > are released by applications. > > Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx> > --- > Documentation/filesystems/ceph.txt | 10 +++++++++ > fs/ceph/addr.c | 22 ++++++++++++++----- > fs/ceph/file.c | 8 ++++++- > fs/ceph/mds_client.c | 34 ++++++++++++++++++++++++++++-- > fs/ceph/super.c | 17 +++++++++++++++ > fs/ceph/super.h | 4 ++++ > 6 files changed, 87 insertions(+), 8 deletions(-) > > diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt > index d2c6a5ccf0f5..215f83625a42 100644 > --- a/Documentation/filesystems/ceph.txt > +++ b/Documentation/filesystems/ceph.txt > @@ -158,6 +158,16 @@ Mount Options > copies. Currently, it's only used in copy_file_range, which will revert > to the default VFS implementation if this option is used. > > + recover_session=<no|clean> > + Set auto reconnect mode in the case of blacklisted. Auto reconnect > + is disabled when mode is 'no'. In 'clean' mode, client reconnect > + to ceph cluster automatically when it detects itself is blacklisted. > + During reconnect, client drops dirty data/metadata, invalidates page > + caches and writable file handles. After reconnect, file locks become > + stale because MDS lose track of them. 
If an inode contains any stale > + file lock, read/write on the inode are not allowed until all stale file > + locks are released by applications. The default mode is 'no'. > + > More Information > ================ > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c > index 9f357c5ce84d..982bb8d7aa03 100644 > --- a/fs/ceph/addr.c > +++ b/fs/ceph/addr.c > @@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page) > { > struct inode *inode = file_inode(filp); > struct ceph_inode_info *ci = ceph_inode(inode); > - struct ceph_osd_client *osdc = > - &ceph_inode_to_client(inode)->client->osdc; > + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > int err = 0; > u64 off = page_offset(page); > u64 len = PAGE_SIZE; > @@ -219,8 +218,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) > > dout("readpage inode %p file %p page %p index %lu\n", > inode, filp, page, page->index); > - err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, > - off, &len, > + err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), > + &ci->i_layout, off, &len, > ci->i_truncate_seq, ci->i_truncate_size, > &page, 1, 0); > if (err == -ENOENT) > @@ -228,6 +227,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) > if (err < 0) { > SetPageError(page); > ceph_fscache_readpage_cancel(inode, page); > + if (err == -EBLACKLISTED) > + fsc->blacklisted = 1; > goto out; > } > if (err < PAGE_SIZE) > @@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req) > int i; > > dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); > + if (rc == -EBLACKLISTED) > + ceph_inode_to_client(inode)->blacklisted = 1; > > /* unlock all pages, zeroing any data we didn't read */ > osd_data = osd_req_op_extent_osd_data(req, 0); > @@ -641,6 +644,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) > end_page_writeback(page); > return err; > } > + if (err == -EBLACKLISTED) > + 
fsc->blacklisted = 1; > dout("writepage setting page/mapping error %d %p\n", > err, page); > SetPageError(page); > @@ -721,6 +726,8 @@ static void writepages_finish(struct ceph_osd_request *req) > if (rc < 0) { > mapping_set_error(mapping, rc); > ceph_set_error_write(ci); > + if (rc == -EBLACKLISTED) > + fsc->blacklisted = 1; > } else { > ceph_clear_error_write(ci); > } > @@ -1947,12 +1954,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, > > if (err >= 0 || err == -ENOENT) > have |= POOL_READ; > - else if (err != -EPERM) > + else if (err != -EPERM) { > + if (err == -EBLACKLISTED) > + fsc->blacklisted = 1; > goto out_unlock; > + } > > if (err2 == 0 || err2 == -EEXIST) > have |= POOL_WRITE; > else if (err2 != -EPERM) { > + if (err2 == -EBLACKLISTED) > + fsc->blacklisted = 1; > err = err2; > goto out_unlock; > } > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index 42cb1453c602..856a8f8e4981 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -698,7 +698,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > ceph_release_page_vector(pages, num_pages); > } > > - if (ret <= 0 || off >= i_size || !more) > + if (ret < 0) { > + if (ret == -EBLACKLISTED) > + fsc->blacklisted = 1; > + break; > + } > + > + if (off >= i_size || !more) > break; > } > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index c49009965369..4659da732c77 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -3032,18 +3032,23 @@ static void handle_forward(struct ceph_mds_client *mdsc, > pr_err("mdsc_handle_forward decode error err=%d\n", err); > } > > -static int __decode_and_drop_session_metadata(void **p, void *end) > +static int __decode_session_metadata(void **p, void *end, > + bool *blacklisted) > { > /* map<string,string> */ > u32 n; > + bool err_str; > ceph_decode_32_safe(p, end, n, bad); > while (n-- > 0) { > u32 len; > ceph_decode_32_safe(p, end, len, bad); > ceph_decode_need(p, end, len, bad); > + err_str = 
!strncmp(*p, "error_string", len); > *p += len; > ceph_decode_32_safe(p, end, len, bad); > ceph_decode_need(p, end, len, bad); > + if (err_str && strnstr(*p, "blacklisted", len)) > + *blacklisted = true; > *p += len; > } > return 0; > @@ -3067,6 +3072,7 @@ static void handle_session(struct ceph_mds_session *session, > u64 seq; > unsigned long features = 0; > int wake = 0; > + bool blacklisted = false; > > /* decode */ > ceph_decode_need(&p, end, sizeof(*h), bad); > @@ -3079,7 +3085,7 @@ static void handle_session(struct ceph_mds_session *session, > if (msg_version >= 3) { > u32 len; > /* version >= 2, metadata */ > - if (__decode_and_drop_session_metadata(&p, end) < 0) > + if (__decode_session_metadata(&p, end, &blacklisted) < 0) > goto bad; > /* version >= 3, feature bits */ > ceph_decode_32_safe(&p, end, len, bad); > @@ -3166,6 +3172,8 @@ static void handle_session(struct ceph_mds_session *session, > session->s_state = CEPH_MDS_SESSION_REJECTED; > cleanup_session_requests(mdsc, session); > remove_session_caps(session); > + if (blacklisted) > + mdsc->fsc->blacklisted = 1; > wake = 2; /* for good measure */ > break; > > @@ -4015,7 +4023,27 @@ static void lock_unlock_sessions(struct ceph_mds_client *mdsc) > mutex_unlock(&mdsc->mutex); > } > > +void maybe_recover_session(struct ceph_mds_client *mdsc) This function should be static > +{ > + struct ceph_fs_client *fsc = mdsc->fsc; > + > + if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) > + return; > + > + if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) > + return; > > + if (!READ_ONCE(fsc->blacklisted)) > + return; > + > + if (fsc->last_force_reconnect && > + time_before(jiffies, fsc->last_force_reconnect + HZ * 60 * 30)) > + return; > + > + pr_info("auto reconnect after blacklisted\n"); > + fsc->last_force_reconnect = jiffies; > + ceph_force_reconnect(fsc->sb); > +} > > /* > * delayed work -- periodically trim expired leases, renew caps with mds > @@ -4089,6 +4117,8 @@ static void delayed_work(struct work_struct 
*work) > > ceph_trim_snapid_map(mdsc); > > + maybe_recover_session(mdsc); > + > schedule_delayed(mdsc); > } > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c > index b55ab2fd73db..8231ad96de48 100644 > --- a/fs/ceph/super.c > +++ b/fs/ceph/super.c > @@ -143,6 +143,7 @@ enum { > Opt_snapdirname, > Opt_mds_namespace, > Opt_fscache_uniq, > + Opt_recover_session, > Opt_last_string, > /* string args above */ > Opt_dirstat, > @@ -184,6 +185,7 @@ static match_table_t fsopt_tokens = { > /* int args above */ > {Opt_snapdirname, "snapdirname=%s"}, > {Opt_mds_namespace, "mds_namespace=%s"}, > + {Opt_recover_session, "recover_session=%s"}, > {Opt_fscache_uniq, "fsc=%s"}, > /* string args above */ > {Opt_dirstat, "dirstat"}, > @@ -254,6 +256,17 @@ static int parse_fsopt_token(char *c, void *private) > if (!fsopt->mds_namespace) > return -ENOMEM; > break; > + case Opt_recover_session: > + if (!strncmp(argstr[0].from, "no", > + argstr[0].to-argstr[0].from)) { > + fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; > + } else if (!strncmp(argstr[0].from, "clean", > + argstr[0].to-argstr[0].from)) { > + fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; > + } else { > + return -EINVAL; > + } > + break; > case Opt_fscache_uniq: > kfree(fsopt->fscache_uniq); > fsopt->fscache_uniq = kstrndup(argstr[0].from, > @@ -576,6 +589,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) > > if (fsopt->mds_namespace) > seq_show_option(m, "mds_namespace", fsopt->mds_namespace); > + > + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) > + seq_show_option(m, "recover_session", "clean"); > + > if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) > seq_printf(m, ",wsize=%d", fsopt->wsize); > if (fsopt->rsize != CEPH_MAX_READ_SIZE) > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > index f64a5271cb1a..358559c17c41 100644 > --- a/fs/ceph/super.h > +++ b/fs/ceph/super.h > @@ -31,6 +31,7 @@ > #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ > #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) > > +#define 
CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reconnect (clean mode) after blacklisted */ > #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ > #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ > #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ > @@ -102,6 +103,9 @@ struct ceph_fs_client { > > unsigned long mount_state; > > + unsigned long last_force_reconnect; > + int blacklisted; > + > u32 filp_gen; > loff_t max_file_size; > -- Jeff Layton <jlayton@xxxxxxxxxx>