On 03/22/2013 05:23 AM, Gregory Farnum wrote:
> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@xxxxxxxxx> wrote:
>> From: "Yan, Zheng" <zheng.z.yan@xxxxxxxxx>
>>
>> In an MDS cluster, not all file system namespace operations that impact
>> multiple MDSes use two-phase commit. Some operations use dentry link/unlink
>> messages to update a replica dentry's linkage after they are committed by
>> the master MDS. It's possible the master MDS crashes after journaling an
>> operation, but before sending the dentry link/unlink messages. Later, when
>> the MDS recovers and receives cache rejoin messages from the surviving
>> MDSes, it will find a linkage mismatch.
>
> I think you're here talking about link/unlink, and the MDS crashing
> after it's sent out the LogEvent to the OSD but it hasn't actually
> dispatched the observer slave requests. Is that right? This commit
> message really confused me; I was trying to figure out which namespace
> operations were hacking around a proper 2-phase commit by unlinking
> and relinking inodes into the tree! (The link/unlink code also is
> doing a 2-phase commit; it just doesn't force a particular order for
> the journaling, which was previously left unhandled.)

I was talking about the cases that use MDCache::send_dentry_{link,unlink}
to update replica dentries. There are a lot of uses in Server.cc.

>
>>
>> The original cache rejoin code does not properly handle the case where
>> dentry unlink messages were missing. Unlinked inodes were linked to stray
>> dentries. So the cache rejoin ack message needs to push replicas of these
>> stray dentries to the surviving MDSes.
>>
>> This patch also adds code that handles cache expiration in the middle of
>> cache rejoining.
>>
>> Signed-off-by: Yan, Zheng <zheng.z.yan@xxxxxxxxx>
>> ---
>>  src/mds/MDCache.cc | 348 +++++++++++++++++++++++++++++++++++------------------
>>  src/mds/MDCache.h  |   1 +
>>  2 files changed, 233 insertions(+), 116 deletions(-)
>>
>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>> index 344777e..38b1fdf 100644
>> --- a/src/mds/MDCache.cc
>> +++ b/src/mds/MDCache.cc
>> @@ -3536,7 +3536,6 @@ void MDCache::rejoin_send_rejoins()
>>      } else {
>>        // strong
>>        if (p->first == 0 && root) {
>> -        p->second->add_weak_inode(root->vino());
>>          p->second->add_strong_inode(root->vino(),
>>                                      root->get_replica_nonce(),
>>                                      root->get_caps_wanted(),
>> @@ -3550,7 +3549,6 @@
>>        }
>>
>>        if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
>> -        p->second->add_weak_inode(in->vino());
>>          p->second->add_strong_inode(in->vino(),
>>                                      in->get_replica_nonce(),
>>                                      in->get_caps_wanted(),
>> @@ -3567,6 +3565,8 @@
>>    for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
>>         p != active_requests.end();
>>         ++p) {
>> +    if ( p->second->is_slave())
>> +      continue;
>>      // auth pins
>>      for (set<MDSCacheObject*>::iterator q = p->second->remote_auth_pins.begin();
>>           q != p->second->remote_auth_pins.end();
>> @@ -4226,6 +4226,8 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>>        rejoin_potential_updated_scatterlocks.insert(in);
>>    }
>>
>> +  rejoin_unlinked_inodes[from].clear();
>> +
>>    // surviving peer may send incorrect dirfrag here (maybe they didn't
>>    // get the fragment notify, or maybe we rolled back?). we need to
>>    // infer the right frag and get them with the program. somehow.
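(For reference, the failure window described in the commit message, drawn
schematically -- an illustration only, not part of the patch:

    master MDS                            surviving MDS
    ----------                            -------------
    journal the link/unlink operation
    ** crash before MDentryLink/
       MDentryUnlink is sent **           replica dentry keeps the old
                                          linkage
    recover, replay the journal
    receive strong cache rejoin      <--  rejoin still describes the
                                          pre-operation linkage
    => linkage mismatch between the replayed journal and the rejoin)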
>> @@ -4332,105 +4334,125 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>>
>>        dn->add_replica(from, q->second.nonce);
>>        dout(10) << " have " << *dn << dendl;
>> -
>> -      // inode?
>> -      if (dnl->is_primary()) {
>> -        CInode *in = dnl->get_inode();
>> -        assert(in);
>> -
>> -        if (strong->strong_inodes.count(in->vino())) {
>> -          MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->vino()];
>>
>> -          // caps_wanted
>> -          if (is.caps_wanted) {
>> -            in->mds_caps_wanted[from] = is.caps_wanted;
>> -            dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
>> -                     << " on " << *in << dendl;
>> -          }
>> -
>> -          // scatterlocks?
>> -          //  infer state from replica state:
>> -          //   * go to MIX if they might have wrlocks
>> -          //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
>> -          in->filelock.infer_state_from_strong_rejoin(is.filelock, true);  // maybe also go to LOCK
>> -          in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
>> -          in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
>> -
>> -          // auth pin?
>> -          if (strong->authpinned_inodes.count(in->vino())) {
>> -            MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
>> -            dout(10) << " inode authpin by " << r << " on " << *in << dendl;
>> -
>> -            // get/create slave mdrequest
>> -            MDRequest *mdr;
>> -            if (have_request(r.reqid))
>> -              mdr = request_get(r.reqid);
>> -            else
>> -              mdr = request_start_slave(r.reqid, r.attempt, from);
>> -            if (strong->frozen_authpin_inodes.count(in->vino())) {
>> -              assert(!in->get_num_auth_pins());
>> -              mdr->freeze_auth_pin(in);
>> -            } else {
>> -              assert(!in->is_frozen_auth_pin());
>> -            }
>> -            mdr->auth_pin(in);
>> -          }
>> -          // xlock(s)?
>> -          if (strong->xlocked_inodes.count(in->vino())) {
>> -            for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->xlocked_inodes[in->vino()].begin();
>> -                 r != strong->xlocked_inodes[in->vino()].end();
>> -                 ++r) {
>> -              SimpleLock *lock = in->get_lock(r->first);
>> -              dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << dendl;
>> -              MDRequest *mdr = request_get(r->second.reqid);  // should have this from auth_pin above.
>> -              assert(mdr->is_auth_pinned(in));
>> -              if (lock->is_stable())
>> -                in->auth_pin(lock);
>> -              lock->set_state(LOCK_XLOCK);
>> -              if (lock == &in->filelock)
>> -                in->loner_cap = -1;
>> -              lock->get_xlock(mdr, mdr->get_client());
>> -              mdr->xlocks.insert(lock);
>> -              mdr->locks.insert(lock);
>> -            }
>> -          }
>> -          // wrlock(s)?
>> -          if (strong->wrlocked_inodes.count(in->vino())) {
>> -            for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->wrlocked_inodes[in->vino()].begin();
>> -                 r != strong->wrlocked_inodes[in->vino()].end();
>> -                 ++r) {
>> -              SimpleLock *lock = in->get_lock(r->first);
>> -              dout(10) << " inode wrlock by " << r->second << " on " << *lock << " on " << *in << dendl;
>> -              MDRequest *mdr = request_get(r->second.reqid);  // should have this from auth_pin above.
>> -              assert(mdr->is_auth_pinned(in));
>> -              lock->set_state(LOCK_LOCK);
>> -              if (lock == &in->filelock)
>> -                in->loner_cap = -1;
>> -              lock->get_wrlock(true);
>> -              mdr->wrlocks.insert(lock);
>> -              mdr->locks.insert(lock);
>> -            }
>> +      if (dnl->is_primary()) {
>> +        if (q->second.is_primary()) {
>> +          if (!(vinodeno_t(q->second.ino, q->first.snapid) == dnl->get_inode()->vino())) {
>
> Maybe it's worth adding an operator!= for vinodeno_t, since you seem
> to use this a couple times.
>
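(A minimal sketch of such an operator, assuming vinodeno_t keeps its
existing operator== as used above -- an illustration, not part of this
patch:

    inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
      return !(l == r);
    }

The test above could then read
`if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino())`.)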
>> +            // the survivor missed MDentryUnlink+MDentryLink messages ?
>> +            assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
>> +            CInode *in = get_inode(q->second.ino, q->first.snapid);
>> +            assert(in);
>> +            rejoin_unlinked_inodes[from].insert(in);
>> +            dout(7) << " sender has primary dentry but wrong inode" << dendl;
>>          }
>>        } else {
>> -        dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl;
>> +          // the survivor missed MDentryLink message ?
>> +          assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
>> +          dout(7) << " sender doesn't primay dentry" << dendl;
>
> doesn't have primary? or something else?

will fix.

>
>> +        }
>> +      } else {
>> +        if (q->second.is_primary()) {
>> +          // the survivor missed MDentryUnlink message ?
>> +          CInode *in = get_inode(q->second.ino, q->first.snapid);
>> +          assert(in);
>> +          rejoin_unlinked_inodes[from].insert(in);
>> +          dout(7) << " sender has primary dentry but we don't" << dendl;
>>          }
>> -
>> -        in->add_replica(from, p->second.nonce);
>> -        dout(10) << " have " << *in << dendl;
>>        }
>>      }
>>    }
>>
>> -  // base inodes? (root, stray, etc.)
>> -  for (set<vinodeno_t>::iterator p = strong->weak_inodes.begin();
>> -       p != strong->weak_inodes.end();
>> +  for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
>> +       p != strong->strong_inodes.end();
>>         ++p) {
>> -    CInode *in = get_inode(*p);
>> -    dout(10) << " have base " << *in << dendl;
>> -    in->add_replica(from);
>> +    CInode *in = get_inode(p->first);
>> +    assert(in);
>> +    in->add_replica(from, p->second.nonce);
>> +    dout(10) << " have " << *in << dendl;
>> +
>> +    MMDSCacheRejoin::inode_strong &is = p->second;
>> +
>> +    // caps_wanted
>> +    if (is.caps_wanted) {
>> +      in->mds_caps_wanted[from] = is.caps_wanted;
>> +      dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
>> +               << " on " << *in << dendl;
>> +    }
>> +
>> +    // scatterlocks?
>> +    //  infer state from replica state:
>> +    //   * go to MIX if they might have wrlocks
>> +    //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
>> +    in->filelock.infer_state_from_strong_rejoin(is.filelock, true);  // maybe also go to LOCK
>> +    in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
>> +    in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
>> +
>> +    // auth pin?
>> +    if (strong->authpinned_inodes.count(in->vino())) {
>> +      MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
>> +      dout(10) << " inode authpin by " << r << " on " << *in << dendl;
>> +
>> +      // get/create slave mdrequest
>> +      MDRequest *mdr;
>> +      if (have_request(r.reqid))
>> +        mdr = request_get(r.reqid);
>> +      else
>> +        mdr = request_start_slave(r.reqid, r.attempt, from);
>> +      if (strong->frozen_authpin_inodes.count(in->vino())) {
>> +        assert(!in->get_num_auth_pins());
>> +        mdr->freeze_auth_pin(in);
>> +      } else {
>> +        assert(!in->is_frozen_auth_pin());
>> +      }
>> +      mdr->auth_pin(in);
>> +    }
>> +    // xlock(s)?
>> +    if (strong->xlocked_inodes.count(in->vino())) {
>> +      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
>> +           q != strong->xlocked_inodes[in->vino()].end();
>> +           ++q) {
>> +        SimpleLock *lock = in->get_lock(q->first);
>> +        dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
>> +        MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
>> +        assert(mdr->is_auth_pinned(in));
>> +        if (lock->is_stable())
>> +          in->auth_pin(lock);
>> +        lock->set_state(LOCK_XLOCK);
>> +        if (lock == &in->filelock)
>> +          in->loner_cap = -1;
>> +        lock->get_xlock(mdr, mdr->get_client());
>> +        mdr->xlocks.insert(lock);
>> +        mdr->locks.insert(lock);
>> +      }
>> +    }
>> +    // wrlock(s)?
>> +    if (strong->wrlocked_inodes.count(in->vino())) {
>> +      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->wrlocked_inodes[in->vino()].begin();
>> +           q != strong->wrlocked_inodes[in->vino()].end();
>> +           ++q) {
>> +        SimpleLock *lock = in->get_lock(q->first);
>> +        dout(10) << " inode wrlock by " << q->second << " on " << *lock << " on " << *in << dendl;
>> +        MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
>> +        assert(mdr->is_auth_pinned(in));
>> +        lock->set_state(LOCK_LOCK);
>> +        if (lock == &in->filelock)
>> +          in->loner_cap = -1;
>> +        lock->get_wrlock(true);
>> +        mdr->wrlocks.insert(lock);
>> +        mdr->locks.insert(lock);
>> +      }
>> +    }
>>    }
>>
>> -
>> +  // unlinked inodes should be in stray
>> +  for (set<CInode*>::iterator p = rejoin_unlinked_inodes[from].begin();
>> +       p != rejoin_unlinked_inodes[from].end();
>> +       ++p) {
>> +    CInode *in = *p;
>> +    dout(7) << " unlinked inode " << *in << dendl;
>> +    assert(in->get_parent_dn());
>> +    assert(in->is_replica(from));
>> +  }
>
> I'm not clear on why we need to check this here — the previous for
> loop wasn't adding any inodes to the cache, so shouldn't we just check
> these conditions as we add them?

will update the code. Thanks

Yan, Zheng
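(One way the checks might fold into the insertion sites, as suggested
above -- a sketch against the mismatch branches in
handle_cache_rejoin_strong, not the actual follow-up change:

    CInode *in = get_inode(q->second.ino, q->first.snapid);
    assert(in);                   // inode must already be in our cache
    assert(in->get_parent_dn());  // i.e. it now hangs off a stray dentry
    rejoin_unlinked_inodes[from].insert(in);
)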
>>
>>    // done?
>>    assert(rejoin_gather.count(from));
>> @@ -4448,6 +4470,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>>    dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
>>    int from = ack->get_source().num();
>>
>> +  // for sending cache expire message
>> +  list<CInode*> isolated_inodes;
>> +
>>    // dirs
>>    for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
>>         p != ack->strong_dirfrags.end();
>> @@ -4455,7 +4480,29 @@
>>      // we may have had incorrect dir fragmentation; refragment based
>>      // on what they auth tells us.
>>      CDir *dir = get_force_dirfrag(p->first);
>> -    assert(dir);
>> +    if (!dir) {
>> +      CInode *diri = get_inode(p->first.ino);
>> +      if (!diri) {
>> +        // barebones inode; the full inode loop below will clean up.
>> +        diri = new CInode(this, false);
>> +        diri->inode.ino = p->first.ino;
>> +        diri->inode.mode = S_IFDIR;
>> +        if (MDS_INO_MDSDIR(p->first.ino)) {
>> +          diri->inode_auth = pair<int,int>(from, CDIR_AUTH_UNKNOWN);
>> +          add_inode(diri);
>> +          dout(10) << " add inode " << *diri << dendl;
>> +        } else {
>> +          diri->inode_auth = CDIR_AUTH_UNDEF;
>> +          isolated_inodes.push_back(diri);
>> +          dout(10) << " unconnected dirfrag " << p->first << dendl;
>> +        }
>> +      }
>> +      // barebones dirfrag; the full dirfrag loop below will clean up.
>> +      dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
>> +      if (dir->authority().first != from)
>> +        adjust_subtree_auth(dir, from);
>> +      dout(10) << " add dirfrag " << *dir << dendl;
>> +    }
>>
>>      dir->set_replica_nonce(p->second.nonce);
>>      dir->state_clear(CDir::STATE_REJOINING);
>> @@ -4467,7 +4514,9 @@
>>           q != dmap.end();
>>           ++q) {
>>        CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
>> -      assert(dn);
>> +      if(!dn)
>> +        dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
>> +
>>        CDentry::linkage_t *dnl = dn->get_linkage();
>>
>>        assert(dn->last == q->first.snapid);
>> @@ -4476,33 +4525,48 @@
>>          dn->first = q->second.first;
>>        }
>>
>> +      // may have bad linkage if we missed dentry link/unlink messages
>> +      if (dnl->is_primary()) {
>> +        CInode *in = dnl->get_inode();
>> +        if (!q->second.is_primary() ||
>> +            !(vinodeno_t(q->second.ino, q->first.snapid) == in->vino())) {
>> +          dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
>> +          dir->unlink_inode(dn);
>> +        }
>> +      } else if (dnl->is_remote()) {
>> +        if (!q->second.is_remote() ||
>> +            q->second.remote_ino != dnl->get_remote_ino() ||
>> +            q->second.remote_d_type != dnl->get_remote_d_type()) {
>> +          dout(10) << " had bad linkage for " << *dn << dendl;
>> +          dir->unlink_inode(dn);
>> +        }
>> +      } else {
>> +        if (!q->second.is_null())
>> +          dout(10) << " had bad linkage for " << *dn << dendl;
>> +      }
>> +
>>        // hmm, did we have the proper linkage here?
>> -      if (dnl->is_null() &&
>> -          !q->second.is_null()) {
>> -        dout(10) << " had bad (missing) linkage for " << *dn << dendl;
>> +      if (dnl->is_null() && !q->second.is_null()) {
>>          if (q->second.is_remote()) {
>>            dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
>>          } else {
>>            CInode *in = get_inode(q->second.ino, q->first.snapid);
>> -          assert(in == 0); // a rename would have been caught be the resolve stage.
>> -          // barebones inode; the full inode loop below will clean up.
>> -          in = new CInode(this, false, q->second.first, q->first.snapid);
>> -          in->inode.ino = q->second.ino;
>> -          add_inode(in);
>> +          if (!in) {
>> +            // barebones inode; assume it's dir, the full inode loop below will clean up.
>> +            in = new CInode(this, false, q->second.first, q->first.snapid);
>> +            in->inode.ino = q->second.ino;
>> +            in->inode.mode = S_IFDIR;
>> +            add_inode(in);
>> +            dout(10) << " add inode " << *in << dendl;
>> +          } else if (in->get_parent_dn()) {
>> +            dout(10) << " had bad linkage for " << *(in->get_parent_dn())
>> +                     << ", unlinking " << *in << dendl;
>> +            in->get_parent_dir()->unlink_inode(in->get_parent_dn());
>> +          }
>>            dn->dir->link_primary_inode(dn, in);
>>          }
>>        }
>> -      else if (!dnl->is_null() &&
>> -               q->second.is_null()) {
>> -        dout(0) << " had bad linkage for " << *dn << dendl;
>> -        /*
>> -         * this should happen:
>> -         *  if we're a survivor, any unlink should commit or rollback during
>> -         *  the resolve stage.
>> -         *  if we failed, we shouldn't have non-auth leaf dentries at all
>> -         */
>> -        assert(0);  // uh oh.
>> -      }
>> +
>>        dn->set_replica_nonce(q->second.nonce);
>>        dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters);
>>        dn->state_clear(CDentry::STATE_REJOINING);
>> @@ -4564,6 +4628,21 @@
>>      dout(10) << " got inode locks " << *in << dendl;
>>    }
>>
>> +  // trim unconnected subtree
>> +  if (!isolated_inodes.empty()) {
>> +    map<int, MCacheExpire*> expiremap;
>> +    for (list<CInode*>::iterator p = isolated_inodes.begin();
>> +         p != isolated_inodes.end();
>> +         ++p) {
>> +      list<CDir*> ls;
>> +      (*p)->get_dirfrags(ls);
>> +      trim_dirfrag(*ls.begin(), 0, expiremap);
>> +      assert((*p)->get_num_ref() == 0);
>> +      delete *p;
>> +    }
>> +    send_expire_messages(expiremap);
>> +  }
>> +
>>    // done?
>>    assert(rejoin_ack_gather.count(from));
>>    rejoin_ack_gather.erase(from);
>> @@ -5164,6 +5243,37 @@ void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snap
>>  void MDCache::rejoin_send_acks()
>>  {
>>    dout(7) << "rejoin_send_acks" << dendl;
>> +
>> +  // replicate stray
>> +  for (map<int, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
>> +       p != rejoin_unlinked_inodes.end();
>> +       ++p) {
>> +    for (set<CInode*>::iterator q = p->second.begin();
>> +         q != p->second.end();
>> +         ++q) {
>> +      CInode *in = *q;
>> +      dout(7) << " unlinked inode " << *in << dendl;
>> +      // inode expired
>> +      if (!in->is_replica(p->first))
>> +        continue;
>> +      while (1) {
>> +        CDentry *dn = in->get_parent_dn();
>> +        if (dn->is_replica(p->first))
>> +          break;
>> +        dn->add_replica(p->first);
>> +        CDir *dir = dn->get_dir();
>> +        if (dir->is_replica(p->first))
>> +          break;
>> +        dir->add_replica(p->first);
>> +        in = dir->get_inode();
>> +        if (in->is_replica(p->first))
>> +          break;
>> +        if (in->is_base())
>> +          break;
>> +      }
>> +    }
>> +  }
>> +  rejoin_unlinked_inodes.clear();
>>
>>    // send acks to everyone in the recovery set
>>    map<int,MMDSCacheRejoin*> ack;
>> @@ -5203,23 +5313,29 @@ void MDCache::rejoin_send_acks()
>>        CDentry *dn = q->second;
>>        CDentry::linkage_t *dnl = dn->get_linkage();
>>
>> +      // inode
>> +      CInode *in = NULL;
>> +      if (dnl->is_primary())
>> +        in = dnl->get_inode();
>> +
>>        // dentry
>>        for (map<int,int>::iterator r = dn->replicas_begin();
>>             r != dn->replicas_end();
>> -           ++r)
>> +           ++r) {
>>          ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
>>                                           dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
>>                                           dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
>>                                           dnl->is_remote() ? dnl->get_remote_d_type():0,
>>                                           ++r->second,
>>                                           dn->lock.get_replica_state());
>> +        // peer missed MDentrylink message ?
>> +        if (in && !in->is_replica(r->first))
>> +          in->add_replica(r->first);
>> +      }
>>
>> -      if (!dnl->is_primary())
>> +      if (!in)
>>          continue;
>>
>> -      // inode
>> -      CInode *in = dnl->get_inode();
>> -
>>        for (map<int,int>::iterator r = in->replicas_begin();
>>             r != in->replicas_end();
>>             ++r) {
>> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
>> index 85f5d65..09cc092 100644
>> --- a/src/mds/MDCache.h
>> +++ b/src/mds/MDCache.h
>> @@ -416,6 +416,7 @@ protected:
>>    set<CInode*> rejoin_undef_inodes;
>>    set<CInode*> rejoin_potential_updated_scatterlocks;
>>    set<CDir*> rejoin_undef_dirfrags;
>> +  map<int, set<CInode*> > rejoin_unlinked_inodes;
>>
>>    vector<CInode*> rejoin_recover_q, rejoin_check_q;
>>    list<Context*> rejoin_waiters;
>> --
>> 1.7.11.7
>>