On 03/22/2013 01:59 AM, Gregory Farnum wrote: > On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@xxxxxxxxx> wrote: >> From: "Yan, Zheng" <zheng.z.yan@xxxxxxxxx> >> >> For mds cluster, rename operation may involve multiple MDS. If the >> rename source's auth MDS crashes after some witness MDS have prepared >> the rename but before the rename is committing. Later when the MDS >> recovers, its subtree map and linkages are different from the prepared >> MDS'. This causes problems for both subtree resolve and cache rejoin. >> The solution is, if the rename source's auth MDS fails, the prepared >> witness MDS query the master MDS if the operation is committing. If >> it's not, rollback the rename, then send resolve message to the >> recovering MDS. >> >> Another similar case is a prepared witness MDS crashes when the >> rename source's auth MDS has prepared or is preparing the operation. >> when the witness recovers, the master just delay sending the resolve >> ack message until the it commits the operation. >> >> This patch also updates Server::handle_client_rename(). Make preparing >> the rename source's auth MDS be the final step before committing the >> rename. > > Why? It's not immediately obvious to me what the benefit is, and the > commit message should state it. :) For the second case, it's possible the recovering MDS is anchor server. The master delays sending the resolve ack message until pending update is committed. To commit the pending update, the master needs anchor server's preparation ack. The master and the anchor server wait for each other. > >> >> Signed-off-by: Yan, Zheng <zheng.z.yan@xxxxxxxxx> >> --- >> src/mds/MDCache.cc | 75 +++++++++++++++++++++++++++++----------- >> src/mds/MDCache.h | 17 +++++++-- >> src/mds/Mutation.h | 2 ++ >> src/mds/Server.cc | 100 ++++++++++++++++++++++++++++------------------------- >> 4 files changed, 124 insertions(+), 70 deletions(-) >> >> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc >> index 9b37b1e..d934020 100644 >> --- a/src/mds/MDCache.cc >> +++ b/src/mds/MDCache.cc >> @@ -2491,7 +2491,7 @@ void MDCache::send_slave_resolves() >> if (!p->second->is_slave() || !p->second->slave_did_prepare()) >> continue; >> int master = p->second->slave_to_mds; >> - if (resolve_set.count(master)) { >> + if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) { >> dout(10) << " including uncommitted " << *p->second << dendl; >> if (!resolves.count(master)) >> resolves[master] = new MMDSResolve; >> @@ -2610,6 +2610,7 @@ void MDCache::handle_mds_failure(int who) >> >> resolve_gather.insert(who); >> discard_delayed_resolve(who); >> + ambiguous_slave_updates.erase(who); >> >> rejoin_gather.insert(who); >> rejoin_sent.erase(who); // i need to send another >> @@ -2642,14 +2643,46 @@ void MDCache::handle_mds_failure(int who) >> finish.push_back(p->second); >> } >> } >> + >> + if (p->second->is_slave() && >> + p->second->slave_did_prepare() && p->second->more()->srcdn_auth_mds == who && >> + mds->mdsmap->is_clientreplay_or_active_or_stopping(p->second->slave_to_mds)) { >> + // rename srcdn's auth mds failed, resolve even I'm a survivor. >> + dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl; >> + add_ambiguous_slave_update(p->first, p->second->slave_to_mds); >> + } >> >> // failed node is slave? >> if (p->second->is_master() && !p->second->committing) { >> + if (p->second->more()->srcdn_auth_mds == who) { >> + dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds." >> + << who << " to recover" << dendl; >> + assert(p->second->more()->witnessed.count(who) == 0); >> + if (p->second->more()->is_ambiguous_auth) >> + p->second->clear_ambiguous_auth(); >> + // rename srcdn's auth mds failed, all witnesses will rollback >> + p->second->more()->witnessed.clear(); >> + pending_masters.erase(p->first); >> + } >> + >> if (p->second->more()->witnessed.count(who)) { >> - dout(10) << " master request " << *p->second << " no longer witnessed by slave mds." << who >> - << dendl; >> - // discard this peer's prepare (if any) >> - p->second->more()->witnessed.erase(who); >> + int srcdn_auth = p->second->more()->srcdn_auth_mds; >> + if (srcdn_auth >= 0 && p->second->more()->waiting_on_slave.count(srcdn_auth)) { >> + dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds." >> + << p->second->more()->srcdn_auth_mds << " to reply" << dendl; >> + // waiting for the last slave (rename srcdn's auth mds), delay sending resolve ack >> + // until either the request is committing or the last slave also fails. >> + assert(p->second->more()->waiting_on_slave.size() == 1); >> + pending_masters.insert(p->first); > > The language about "last slave" is confusing me here — I'm with you > that this rename should only have one slave, but I don't think it ever > should have had more than one. Do you mean "only slave" or am I > missing something? Yes, I mean the 'only slave'. But the code 'more()->waiting_on_slave' also considers witnesses also as slave, that's why I use 'last slave'. Will update the comment. > >> + } else { >> + dout(10) << " master request " << *p->second << " no longer witnessed by slave mds." >> + << who << " to recover" << dendl; >> + if (srcdn_auth >= 0) >> + assert(p->second->more()->witnessed.count(srcdn_auth) == 0); >> + >> + // discard this peer's prepare (if any) >> + p->second->more()->witnessed.erase(who); >> + } >> } >> >> if (p->second->more()->waiting_on_slave.count(who)) { >> @@ -2657,14 +2690,8 @@ void MDCache::handle_mds_failure(int who) >> << " to recover" << dendl; >> // retry request when peer recovers >> p->second->more()->waiting_on_slave.erase(who); >> - mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second)); >> - } >> - >> - if (p->second->has_more() && p->second->more()->is_ambiguous_auth && >> - p->second->more()->rename_inode->authority().first == who) { >> - dout(10) << " master request " << *p->second << " waiting for renamed inode's auth mds." << who >> - << " to recover" << dendl; >> - p->second->clear_ambiguous_auth(); > > Why are you getting rid of waiting for the renamed inode's MDS? I > could be misremembering, but I believe we need it, and it might be > different from the source or dest dentry auths. The code is moved up. see above test "(p->second->more()->srcdn_auth_mds == who)" > >> + if (p->second->more()->waiting_on_slave.empty()) >> + mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second)); >> } >> >> if (p->second->locking && p->second->locking_target_mds == who) >> @@ -2951,16 +2978,27 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) >> dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl; >> int from = ack->get_source().num(); >> >> - if (!resolve_ack_gather.count(from)) { >> + if (!resolve_ack_gather.count(from) || >> + mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) { >> ack->put(); >> return; >> } >> >> + if (ambiguous_slave_updates.count(from)) { >> + assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from)); >> + assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping()); >> + } >> + >> for (vector<metareqid_t>::iterator p = ack->commit.begin(); >> p != ack->commit.end(); >> ++p) { >> dout(10) << " commit on slave " << *p << dendl; >> >> + if (ambiguous_slave_updates.count(from)) { >> + remove_ambiguous_slave_update(*p, from); >> + continue; >> + } >> + >> if (mds->is_resolve()) { >> // replay >> MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from); >> @@ -3020,13 +3058,8 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) >> } >> } >> >> - if (!mds->is_resolve()) { >> - for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin(); >> - p != active_requests.end(); ++p) >> - assert(p->second->slave_to_mds != from); >> - } >> - >> - resolve_ack_gather.erase(from); >> + if (!ambiguous_slave_updates.count(from)) >> + resolve_ack_gather.erase(from); >> if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) { >> send_subtree_resolves(); >> process_delayed_resolve(); >> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h >> index 8f262b9..a05ced7 100644 >> --- a/src/mds/MDCache.h >> +++ b/src/mds/MDCache.h >> @@ -327,9 +327,8 @@ protected: >> map<metareqid_t, umaster> uncommitted_masters; // master: req -> slave set >> >> set<metareqid_t> pending_masters; >> + map<int, set<metareqid_t> > ambiguous_slave_updates; >> >> - //map<metareqid_t, bool> ambiguous_slave_updates; // for log trimming. >> - //map<metareqid_t, Context*> waiting_for_slave_update_commit; >> friend class ESlaveUpdate; >> friend class ECommitted; >> >> @@ -353,6 +352,20 @@ protected: >> public: >> void remove_inode_recursive(CInode *in); >> >> + bool is_ambiguous_slave_update(metareqid_t reqid, int master) { >> + return ambiguous_slave_updates.count(master) && >> + ambiguous_slave_updates[master].count(reqid); >> + } >> + void add_ambiguous_slave_update(metareqid_t reqid, int master) { >> + ambiguous_slave_updates[master].insert(reqid); >> + } >> + void remove_ambiguous_slave_update(metareqid_t reqid, int master) { >> + assert(ambiguous_slave_updates[master].count(reqid)); >> + ambiguous_slave_updates[master].erase(reqid); >> + if (ambiguous_slave_updates[master].empty()) >> + ambiguous_slave_updates.erase(master); >> + } >> + >> void add_rollback(metareqid_t reqid, int master) { >> need_resolve_rollback[reqid] = master; >> } >> diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h >> index 5013f04..de122a5 100644 >> --- a/src/mds/Mutation.h >> +++ b/src/mds/Mutation.h >> @@ -207,6 +207,7 @@ struct MDRequest : public Mutation { >> >> // for rename >> set<int> extra_witnesses; // replica list from srcdn auth (rename) >> + int srcdn_auth_mds; >> version_t src_reanchor_atid; // src->dst >> version_t dst_reanchor_atid; // dst->stray >> bufferlist inode_import; >> @@ -233,6 +234,7 @@ struct MDRequest : public Mutation { >> bufferlist rollback_bl; >> >> More() : >> + srcdn_auth_mds(-1), >> src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), >> rename_inode(0), is_freeze_authpin(false), is_ambiguous_auth(false), >> is_remote_frozen_authpin(false), is_inode_exporter(false), >> diff --git a/src/mds/Server.cc b/src/mds/Server.cc >> index 1330f11..b6e5665 100644 >> --- a/src/mds/Server.cc >> +++ b/src/mds/Server.cc >> @@ -5772,12 +5772,52 @@ void Server::handle_client_rename(MDRequest *mdr) >> if (mdr->now == utime_t()) >> mdr->now = ceph_clock_now(g_ceph_context); >> >> + // -- prepare anchor updates -- >> + if (!linkmerge || srcdnl->is_primary()) { >> + C_GatherBuilder anchorgather(g_ceph_context); >> + >> + if (srcdnl->is_primary() && >> + (srcdnl->get_inode()->is_anchored() || >> + (srcdnl->get_inode()->is_dir() && (srcdnl->get_inode()->inode.rstat.ranchors || >> + srcdnl->get_inode()->nested_anchors || >> + !mdcache->is_leaf_subtree(mdcache->get_projected_subtree_root(srcdn->get_dir()))))) && >> + !mdr->more()->src_reanchor_atid) { >> + dout(10) << "reanchoring src->dst " << *srcdnl->get_inode() << dendl; >> + vector<Anchor> trace; >> + destdn->make_anchor_trace(trace, srcdnl->get_inode()); >> + mds->anchorclient->prepare_update(srcdnl->get_inode()->ino(), >> + trace, &mdr->more()->src_reanchor_atid, >> + anchorgather.new_sub()); >> + } >> + if (destdnl->is_primary() && >> + destdnl->get_inode()->is_anchored() && >> + !mdr->more()->dst_reanchor_atid) { >> + dout(10) << "reanchoring dst->stray " << *destdnl->get_inode() << dendl; >> + >> + assert(straydn); >> + vector<Anchor> trace; >> + straydn->make_anchor_trace(trace, destdnl->get_inode()); >> + >> + mds->anchorclient->prepare_update(destdnl->get_inode()->ino(), trace, >> + &mdr->more()->dst_reanchor_atid, anchorgather.new_sub()); >> + } >> + >> + if (anchorgather.has_subs()) { >> + anchorgather.set_finisher(new C_MDS_RetryRequest(mdcache, mdr)); >> + anchorgather.activate(); >> + return; // waiting for anchor prepares >> + } >> + >> + assert(g_conf->mds_kill_rename_at != 2); >> + } >> + >> // -- prepare witnesses -- >> >> // do srcdn auth last >> int last = -1; >> if (!srcdn->is_auth()) { >> last = srcdn->authority().first; >> + mdr->more()->srcdn_auth_mds = last; >> // ask auth of srci to mark srci as ambiguous auth if more than two MDS >> // are involved in the rename operation. >> if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) { >> @@ -5803,58 +5843,18 @@ void Server::handle_client_rename(MDRequest *mdr) >> if (!mdr->more()->waiting_on_slave.empty()) >> return; // we're waiting for a witness. >> >> - if (last >= 0 && >> - mdr->more()->witnessed.count(last) == 0 && >> - mdr->more()->waiting_on_slave.count(last) == 0) { >> + if (last >= 0 && mdr->more()->witnessed.count(last) == 0) { >> dout(10) << " preparing last witness (srcdn auth)" << dendl; >> + assert(mdr->more()->waiting_on_slave.count(last) == 0); >> _rename_prepare_witness(mdr, last, witnesses, srcdn, destdn, straydn); >> return; >> } >> >> // test hack: bail after slave does prepare, so we can verify it's _live_ rollback. >> if (!mdr->more()->slaves.empty() && !srci->is_dir()) >> - assert(g_conf->mds_kill_rename_at != 2); >> + assert(g_conf->mds_kill_rename_at != 3); >> if (!mdr->more()->slaves.empty() && srci->is_dir()) >> - assert(g_conf->mds_kill_rename_at != 3); >> - >> - // -- prepare anchor updates -- >> - if (!linkmerge || srcdnl->is_primary()) { >> - C_GatherBuilder anchorgather(g_ceph_context); >> - >> - if (srcdnl->is_primary() && >> - (srcdnl->get_inode()->is_anchored() || >> - (srcdnl->get_inode()->is_dir() && (srcdnl->get_inode()->inode.rstat.ranchors || >> - srcdnl->get_inode()->nested_anchors || >> - !mdcache->is_leaf_subtree(mdcache->get_projected_subtree_root(srcdn->get_dir()))))) && >> - !mdr->more()->src_reanchor_atid) { >> - dout(10) << "reanchoring src->dst " << *srcdnl->get_inode() << dendl; >> - vector<Anchor> trace; >> - destdn->make_anchor_trace(trace, srcdnl->get_inode()); >> - mds->anchorclient->prepare_update(srcdnl->get_inode()->ino(), >> - trace, &mdr->more()->src_reanchor_atid, >> - anchorgather.new_sub()); >> - } >> - if (destdnl->is_primary() && >> - destdnl->get_inode()->is_anchored() && >> - !mdr->more()->dst_reanchor_atid) { >> - dout(10) << "reanchoring dst->stray " << *destdnl->get_inode() << dendl; >> - >> - assert(straydn); >> - vector<Anchor> trace; >> - straydn->make_anchor_trace(trace, destdnl->get_inode()); >> - >> - mds->anchorclient->prepare_update(destdnl->get_inode()->ino(), trace, >> - &mdr->more()->dst_reanchor_atid, anchorgather.new_sub()); >> - } >> - >> - if (anchorgather.has_subs()) { >> - anchorgather.set_finisher(new C_MDS_RetryRequest(mdcache, mdr)); >> - anchorgather.activate(); >> - return; // waiting for anchor prepares >> - } >> - >> assert(g_conf->mds_kill_rename_at != 4); >> - } >> >> // -- prepare journal entry -- >> mdr->ls = mdlog->get_current_segment(); >> @@ -6762,10 +6762,17 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, >> // abort >> // rollback_bl may be empty if we froze the inode but had to provide an expanded >> // witness list from the master, and they failed before we tried prep again. >> - if (mdr->more()->rollback_bl.length()) >> - do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr); >> - else >> + if (mdr->more()->rollback_bl.length()) { >> + if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) { >> + mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds); >> + // rollback but preserve the slave request >> + do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, NULL); >> + } else >> + do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr); >> + } else { >> dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl; >> + mds->mdcache->request_finish(mdr); >> + } >> } >> } >> >> @@ -6825,7 +6832,6 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) >> dout(10) << "do_rename_rollback on " << rollback.reqid << dendl; >> // need to finish this update before sending resolve to claim the subtree >> mds->mdcache->add_rollback(rollback.reqid, master); >> - assert(mdr || mds->is_resolve()); >> >> Mutation *mut = new Mutation(rollback.reqid); >> mut->ls = mds->mdlog->get_current_segment(); >> -- >> 1.7.11.7 >> -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html