On Wed, 2019-06-05 at 17:09 +0800, Yan, Zheng wrote: > On Tue, Jun 4, 2019 at 9:12 PM Jeff Layton <jlayton@xxxxxxxxxx> wrote: > > On Tue, 2019-06-04 at 20:35 +0800, Yan, Zheng wrote: > > > On Tue, Jun 4, 2019 at 7:04 PM Jeff Layton <jlayton@xxxxxxxxxx> wrote: > > > > On Tue, 2019-06-04 at 17:39 +0800, Yan, Zheng wrote: > > > > > Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx> > > > > > --- > > > > > include/linux/ceph/libceph.h | 1 + > > > > > include/linux/ceph/messenger.h | 1 + > > > > > include/linux/ceph/mon_client.h | 1 + > > > > > include/linux/ceph/osd_client.h | 1 + > > > > > net/ceph/ceph_common.c | 8 ++++++++ > > > > > net/ceph/messenger.c | 5 +++++ > > > > > net/ceph/mon_client.c | 7 +++++++ > > > > > net/ceph/osd_client.c | 16 ++++++++++++++++ > > > > > 8 files changed, 40 insertions(+) > > > > > > > > > > diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h > > > > > index a3cddf5f0e60..f29959eed025 100644 > > > > > --- a/include/linux/ceph/libceph.h > > > > > +++ b/include/linux/ceph/libceph.h > > > > > @@ -291,6 +291,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private); > > > > > struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); > > > > > u64 ceph_client_gid(struct ceph_client *client); > > > > > extern void ceph_destroy_client(struct ceph_client *client); > > > > > +extern void ceph_reset_client_addr(struct ceph_client *client); > > > > > extern int __ceph_open_session(struct ceph_client *client, > > > > > unsigned long started); > > > > > extern int ceph_open_session(struct ceph_client *client); > > > > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h > > > > > index 23895d178149..c4458dc6a757 100644 > > > > > --- a/include/linux/ceph/messenger.h > > > > > +++ b/include/linux/ceph/messenger.h > > > > > @@ -337,6 +337,7 @@ extern void ceph_msgr_flush(void); > > > > > extern void ceph_messenger_init(struct ceph_messenger *msgr, > > > > > struct ceph_entity_addr *myaddr); > > > > > extern void ceph_messenger_fini(struct ceph_messenger *msgr); > > > > > +extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr); > > > > > > > > > > extern void ceph_con_init(struct ceph_connection *con, void *private, > > > > > const struct ceph_connection_operations *ops, > > > > > diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h > > > > > index 3a4688af7455..0d8d890c6759 100644 > > > > > --- a/include/linux/ceph/mon_client.h > > > > > +++ b/include/linux/ceph/mon_client.h > > > > > @@ -110,6 +110,7 @@ extern int ceph_monmap_contains(struct ceph_monmap *m, > > > > > > > > > > extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); > > > > > extern void ceph_monc_stop(struct ceph_mon_client *monc); > > > > > +extern void ceph_monc_reopen_session(struct ceph_mon_client *monc); > > > > > > > > > > enum { > > > > > CEPH_SUB_MONMAP = 0, > > > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > > > > > index 2294f963dab7..a12b7fc9cfd6 100644 > > > > > --- a/include/linux/ceph/osd_client.h > > > > > +++ b/include/linux/ceph/osd_client.h > > > > > @@ -381,6 +381,7 @@ extern void ceph_osdc_cleanup(void); > > > > > extern int ceph_osdc_init(struct ceph_osd_client *osdc, > > > > > struct ceph_client *client); > > > > > extern void ceph_osdc_stop(struct ceph_osd_client *osdc); > > > > > +extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc); > > > > > > > > > > extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, > > > > > struct ceph_msg *msg); > > > > > diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c > > > > > index 79eac465ec65..55210823d1cc 100644 > > > > > --- a/net/ceph/ceph_common.c > > > > > +++ b/net/ceph/ceph_common.c > > > > > @@ -693,6 +693,14 @@ void ceph_destroy_client(struct ceph_client *client) > > > > > } > > > > > EXPORT_SYMBOL(ceph_destroy_client); > > > > > > > > > > +void ceph_reset_client_addr(struct ceph_client *client) > > > > > +{ > > > > > + ceph_messenger_reset_nonce(&client->msgr); > > > > > + ceph_monc_reopen_session(&client->monc); > > > > > + ceph_osdc_reopen_osds(&client->osdc); > > > > > +} > > > > > +EXPORT_SYMBOL(ceph_reset_client_addr); > > > > > + > > > > > /* > > > > > * true if we have the mon map (and have thus joined the cluster) > > > > > */ > > > > > diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c > > > > > index 3ee380758ddd..cd03a1cba849 100644 > > > > > --- a/net/ceph/messenger.c > > > > > +++ b/net/ceph/messenger.c > > > > > @@ -3028,6 +3028,11 @@ static void con_fault(struct ceph_connection *con) > > > > > } > > > > > > > > > > > > > > > +void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) > > > > > +{ > > > > > + msgr->inst.addr.nonce += 1000000; > > > > > + encode_my_addr(msgr); > > > > > +} > > > > > > > > > > /* > > > > > * initialize a new messenger instance > > > > > diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c > > > > > index 895679d3529b..6dab6a94e9cc 100644 > > > > > --- a/net/ceph/mon_client.c > > > > > +++ b/net/ceph/mon_client.c > > > > > @@ -209,6 +209,13 @@ static void reopen_session(struct ceph_mon_client *monc) > > > > > __open_session(monc); > > > > > } > > > > > > > > > > +void ceph_monc_reopen_session(struct ceph_mon_client *monc) > > > > > +{ > > > > > + mutex_lock(&monc->mutex); > > > > > + reopen_session(monc); > > > > > + mutex_unlock(&monc->mutex); > > > > > +} > > > > > + > > > > > static void un_backoff(struct ceph_mon_client *monc) > > > > > { > > > > > monc->hunt_mult /= 2; /* reduce by 50% */ > > > > > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c > > > > > index e6d31e0f0289..67e9466f27fd 100644 > > > > > --- a/net/ceph/osd_client.c > > > > > +++ b/net/ceph/osd_client.c > > > > > @@ -5089,6 +5089,22 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, > > > > > } > > > > > EXPORT_SYMBOL(ceph_osdc_call); > > > > > > > > > > +/* > > > > > + * reset all osd connections > > > > > + */ > > > > > +void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc) > > > > > +{ > > > > > + struct rb_node *n; > > > > > + down_write(&osdc->lock); > > > > > + for (n = rb_first(&osdc->osds); n; ) { > > > > > + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); > > > > > + n = rb_next(n); > > > > > + if (!reopen_osd(osd)) > > > > > + kick_osd_requests(osd); > > > > > + } > > > > > + up_write(&osdc->lock); > > > > > +} > > > > > + > > > > > /* > > > > > * init, shutdown > > > > > */ > > > > > > > > I'd like to better understand what happens to file descriptors that were > > > > opened before the blacklisting occurred. > > > > > > > > Suppose I hold a POSIX lock on a O_RDWR fd, and then get blacklisted. > > > > The admin then remounts the mount. What happens to my fd? Can I carry on > > > > using it, or will it be shut down in some fashion? What about the lock? > > > > I've definitely lost it during the blacklisting -- how do we make the > > > > application aware of that fact? > > > > > > After 'mount -f', cephfs return -EIO for any operation on open files. > > > After remount, operations except fsync and close work as normal. fsync > > > and close return error if any dirty data got dropped. > > > > > > Current cephfs code already handle file lock. after session get > > > evicted, cephfs return -EIO for any lock operation until all local > > > locks are released. (see commit b3f8d68f38a87) > > > > > > > I think that's not sufficient. After a blacklisting event, we need to > > shut down all file descriptors that were open before the event. > > > > Consider: > > > > Suppose I have an application (maybe a distributed database) that opens > > a file O_RDWR and takes out a lock on the file. The client gets > > blacklisted and loses its lock, and then the admin remounts the thing > > leaving the fd intact. > > > > After the blacklisting, the application then performs some writes to the > > file and fsyncs and that all succeeds. Those writes are now being done > > without holding the file lock, even though the application thinks it has > > the lock. That seems like potential silent data corruption. > > > > Is it possible that application uses one file descriptor to do file > locking, and use different file descriptors to do read/write. If it > is, allowing read/write from new file descriptor is not safe either. > Sure, applications do all sorts of crazy things. PostgreSQL (for instance) has a process separate from the writers to handle fsyncs. That could also be problematic in this scenario. The main point here is that a file description is (partly) a representation of state granted by the MDS, and that state can no longer be considered valid after the client has been blacklisted. In practice, applications would need to have special handling to detect and deal with this situation, and >99% of them won't have it. Most applications will need to be restarted altogether after an event like this, which makes me question whether this is something we should do at all. -- Jeff Layton <jlayton@xxxxxxxxxx>