From: "Yan, Zheng" <zheng.z.yan@xxxxxxxxx> The MDS should not trim objects in a non-auth subtree immediately after replaying a slave rename, because the slave rename may require rollback later and these objects are needed for the rollback. Signed-off-by: Yan, Zheng <zheng.z.yan@xxxxxxxxx> --- src/mds/MDCache.cc | 93 +++++++++++++++++++++++++++++++++++++--------- src/mds/MDCache.h | 5 +++ src/mds/Mutation.h | 5 ++- src/mds/events/EMetaBlob.h | 3 +- src/mds/journal.cc | 53 +++++++++++++------------- 5 files changed, 113 insertions(+), 46 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index adcf8c1..5a6d3f2 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -2867,16 +2867,14 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) if (mds->is_resolve()) { // replay - assert(uncommitted_slave_updates[from].count(*p)); + MDSlaveUpdate *su = get_uncommitted_slave_update(from, *p); + assert(su); + // log commit mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, - ESlaveUpdate::OP_COMMIT, - uncommitted_slave_updates[from][*p]->origop)); + ESlaveUpdate::OP_COMMIT, su->origop)); - delete uncommitted_slave_updates[from][*p]; - uncommitted_slave_updates[from].erase(*p); - if (uncommitted_slave_updates[from].empty()) - uncommitted_slave_updates.erase(from); + finish_uncommitted_slave_update(from, *p); mds->mdlog->wait_for_safe(new C_MDC_SlaveCommit(this, from, *p)); mds->mdlog->flush(); @@ -2893,28 +2891,26 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) dout(10) << " abort on slave " << *p << dendl; if (mds->is_resolve()) { - assert(uncommitted_slave_updates[from].count(*p)); + MDSlaveUpdate *su = get_uncommitted_slave_update(from, *p); + assert(su); // perform rollback (and journal a rollback entry) // note: this will hold up the resolve a bit, until the rollback entries journal. 
- switch (uncommitted_slave_updates[from][*p]->origop) { + switch (su->origop) { case ESlaveUpdate::LINK: - mds->server->do_link_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0); + mds->server->do_link_rollback(su->rollback, from, 0); break; case ESlaveUpdate::RENAME: - mds->server->do_rename_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0); + mds->server->do_rename_rollback(su->rollback, from, 0); break; case ESlaveUpdate::RMDIR: - mds->server->do_rmdir_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0); + mds->server->do_rmdir_rollback(su->rollback, from, 0); break; default: assert(0); } - delete uncommitted_slave_updates[from][*p]; - uncommitted_slave_updates[from].erase(*p); - if (uncommitted_slave_updates[from].empty()) - uncommitted_slave_updates.erase(from); + finish_uncommitted_slave_update(from, *p); } else { MDRequest *mdr = request_get(*p); if (mdr->more()->slave_commit) { @@ -2939,7 +2935,63 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) ack->put(); } +void MDCache::add_uncommitted_slave_update(int master, metareqid_t reqid, MDSlaveUpdate *su) +{ + assert(uncommitted_slave_updates[master].count(reqid) == 0); + uncommitted_slave_updates[master][reqid] = su; + if (su->rename_olddir) + uncommitted_slave_rename_olddir[su->rename_olddir]++; + for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); p++) + uncommitted_slave_unlink[*p]++; +} + +void MDCache::finish_uncommitted_slave_update(int master, metareqid_t reqid) +{ + assert(uncommitted_slave_updates[master].count(reqid)); + MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid]; + + uncommitted_slave_updates[master].erase(reqid); + if (uncommitted_slave_updates[master].empty()) + uncommitted_slave_updates.erase(master); + // discard the non-auth subtree we renamed out of + if (su->rename_olddir) { + uncommitted_slave_rename_olddir[su->rename_olddir]--; + if (uncommitted_slave_rename_olddir[su->rename_olddir] == 0) { + 
uncommitted_slave_rename_olddir.erase(su->rename_olddir); + // in the resolve stage, there probably are unfinished rename rollback, + // trim_non_auth_subtree() does not recognize projected linkage change. + // non-auth subtrees will be trimmed when the resolve stage finishes. + if (!mds->is_resolve()) { + CDir *root = get_subtree_root(su->rename_olddir); + if (root->get_dir_auth() == CDIR_AUTH_UNDEF) + try_trim_non_auth_subtree(root); + } + } + } + // removed the inodes that were unlinked by slave update + for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); p++) { + CInode *in = *p; + uncommitted_slave_unlink[in]--; + if (uncommitted_slave_unlink[in] == 0) { + uncommitted_slave_unlink.erase(in); + if (!in->get_projected_parent_dn()) + mds->mdcache->remove_inode_recursive(in); + } + } + delete su; +} +MDSlaveUpdate* MDCache::get_uncommitted_slave_update(int master, metareqid_t reqid) +{ + + MDSlaveUpdate* su = NULL; + if (uncommitted_slave_updates.count(master) && + uncommitted_slave_updates[master].count(reqid)) { + su = uncommitted_slave_updates[master][reqid]; + assert(su); + } + return su; +} void MDCache::disambiguate_imports() { @@ -5788,6 +5840,10 @@ bool MDCache::trim_non_auth_subtree(CDir *dir) { dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl; + // preserve the dir for rollback + if (uncommitted_slave_rename_olddir.count(dir)) + return true; + bool keep_dir = false; CDir::map_t::iterator j = dir->begin(); CDir::map_t::iterator i = j; @@ -5805,7 +5861,9 @@ bool MDCache::trim_non_auth_subtree(CDir *dir) for (list<CDir*>::iterator subdir = subdirs.begin(); subdir != subdirs.end(); ++subdir) { - if ((*subdir)->is_subtree_root() || my_ambiguous_imports.count((*subdir)->dirfrag())) { + if (uncommitted_slave_rename_olddir.count(*subdir) || // preserve the dir for rollback + my_ambiguous_imports.count((*subdir)->dirfrag()) || + (*subdir)->is_subtree_root()) { keep_inode = true; dout(10) << "trim_non_auth_subtree(" << 
dir << ") subdir " << *subdir << "is kept!" << dendl; } @@ -5837,6 +5895,7 @@ bool MDCache::trim_non_auth_subtree(CDir *dir) dir->remove_dentry(dn); } } + /** * We've now checked all our children and deleted those that need it. * Now return to caller, and tell them if *we're* a keeper. diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 31c7467..ecf5b29 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -312,6 +312,8 @@ protected: map<int, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports; map<int, map<metareqid_t, MDSlaveUpdate*> > uncommitted_slave_updates; // slave: for replay. + map<CDir*, int> uncommitted_slave_rename_olddir; // slave: preserve the non-auth dir until seeing commit. + map<CInode*, int> uncommitted_slave_unlink; // slave: preserve the unlinked inode until seeing commit. // track master requests whose slaves haven't acknowledged commit struct umaster { @@ -337,6 +339,9 @@ protected: void disambiguate_imports(); void recalc_auth_bits(); void trim_unlinked_inodes(); + void add_uncommitted_slave_update(int master, metareqid_t reqid, MDSlaveUpdate*); + void finish_uncommitted_slave_update(int master, metareqid_t reqid); + MDSlaveUpdate* get_uncommitted_slave_update(int master, metareqid_t reqid); public: void remove_inode_recursive(CInode *in); diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index d0d3eca..36d62a7 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -298,10 +298,13 @@ struct MDSlaveUpdate { bufferlist rollback; elist<MDSlaveUpdate*>::item item; Context *waiter; + CDir* rename_olddir; + set<CInode*> unlinked; MDSlaveUpdate(int oo, bufferlist &rbl, elist<MDSlaveUpdate*> &list) : origop(oo), item(this), - waiter(0) { + waiter(0), + rename_olddir(0) { rollback.claim(rbl); list.push_back(&item); } diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h index 9bbd615..77ceb94 100644 --- a/src/mds/events/EMetaBlob.h +++ b/src/mds/events/EMetaBlob.h @@ -27,6 +27,7 @@ class MDS; class MDLog; 
class LogSegment; +class MDSlaveUpdate; /* * a bunch of metadata in the journal @@ -674,7 +675,7 @@ private: } void update_segment(LogSegment *ls); - void replay(MDS *mds, LogSegment *ls=0); + void replay(MDS *mds, LogSegment *ls, MDSlaveUpdate *su=NULL); }; WRITE_CLASS_ENCODER(EMetaBlob) WRITE_CLASS_ENCODER(EMetaBlob::fullbit) diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 72a5e5e..3e7e0fa 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -415,7 +415,7 @@ void EMetaBlob::fullbit::update_inode(MDS *mds, CInode *in) in->old_inodes = old_inodes; } -void EMetaBlob::replay(MDS *mds, LogSegment *logseg) +void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) { dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl; @@ -676,8 +676,12 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) // see if we can discard the subtree we renamed out of CDir *root = mds->mdcache->get_subtree_root(olddir); - if (root->get_dir_auth() == CDIR_AUTH_UNDEF) - mds->mdcache->try_trim_non_auth_subtree(root); + if (root->get_dir_auth() == CDIR_AUTH_UNDEF) { + if (slaveup) // preserve the old dir until slave commit + slaveup->rename_olddir = olddir; + else + mds->mdcache->try_trim_non_auth_subtree(root); + } } // if we are the srci importer, we'll also have some dirfrags we have to open up... 
@@ -710,8 +714,12 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); p++) unlinked.erase(*p); dout(10) << " unlinked set contains " << unlinked << dendl; - for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) - mds->mdcache->remove_inode_recursive(p->first); + for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) { + if (slaveup) // preserve unlinked inodes until slave commit + slaveup->unlinked.insert(p->first); + else + mds->mdcache->remove_inode_recursive(p->first); + } } // table client transactions @@ -1107,23 +1115,21 @@ void ECommitted::replay(MDS *mds) void ESlaveUpdate::replay(MDS *mds) { + MDSlaveUpdate *su; switch (op) { case ESlaveUpdate::OP_PREPARE: dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds." << master << ": applying commit, saving rollback info" << dendl; - assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid) == 0); - commit.replay(mds, _segment); - mds->mdcache->uncommitted_slave_updates[master][reqid] = - new MDSlaveUpdate(origop, rollback, _segment->slave_updates); + su = new MDSlaveUpdate(origop, rollback, _segment->slave_updates); + commit.replay(mds, _segment, su); + mds->mdcache->add_uncommitted_slave_update(master, reqid, su); break; case ESlaveUpdate::OP_COMMIT: - if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { + su = mds->mdcache->get_uncommitted_slave_update(master, reqid); + if (su) { dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master << dendl; - delete mds->mdcache->uncommitted_slave_updates[master][reqid]; - mds->mdcache->uncommitted_slave_updates[master].erase(reqid); - if (mds->mdcache->uncommitted_slave_updates[master].empty()) - mds->mdcache->uncommitted_slave_updates.erase(master); + mds->mdcache->finish_uncommitted_slave_update(master, reqid); } else { dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." 
<< master << ": ignoring, no previously saved prepare" << dendl; @@ -1131,19 +1137,12 @@ void ESlaveUpdate::replay(MDS *mds) break; case ESlaveUpdate::OP_ROLLBACK: - if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { - dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master - << ": applying rollback commit blob" << dendl; - assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid)); - commit.replay(mds, _segment); - delete mds->mdcache->uncommitted_slave_updates[master][reqid]; - mds->mdcache->uncommitted_slave_updates[master].erase(reqid); - if (mds->mdcache->uncommitted_slave_updates[master].empty()) - mds->mdcache->uncommitted_slave_updates.erase(master); - } else { - dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master - << ": ignoring, no previously saved prepare" << dendl; - } + dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master + << ": applying rollback commit blob" << dendl; + su = mds->mdcache->get_uncommitted_slave_update(master, reqid); + if (su) + mds->mdcache->finish_uncommitted_slave_update(master, reqid); + commit.replay(mds, _segment); break; default: -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html