On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@xxxxxxxxx> wrote: > From: "Yan, Zheng" <zheng.z.yan@xxxxxxxxx> > > For MDS cluster, not all file system namespace operations that impact > multiple MDS use two phase commit. Some operations use dentry link/unlink > message to update replica dentry's linkage after they are committed by > the master MDS. It's possible the master MDS crashes after journaling an > operation, but before sending the dentry link/unlink messages. Later when > the MDS recovers and receives cache rejoin messages from the surviving > MDS, it will find linkage mismatch. I think you're here talking about link/unlink, and the MDS crashing after it's sent out the LogEvent to the OSD but it hasn't actually dispatched the observer slave requests. Is that right? This commit message really confused me; I was trying to figure out which namespace operations were hacking around a proper 2-phase commit by unlinking and relinking inodes into the tree! (The link/unlink code also is doing a 2-phase commit, it just doesn't force a particular order for the journaling, which was previously left unhandled). > > The original cache rejoin code does not properly handle the case that > dentry unlink messages were missing. Unlinked inodes were linked to stray > dentries. So the cache rejoin ack message need push replicas of these > stray dentries to the surviving MDS. > > This patch also adds code that handles cache expiration in the middle of > cache rejoining. > > Signed-off-by: Yan, Zheng <zheng.z.yan@xxxxxxxxx> > --- > src/mds/MDCache.cc | 348 +++++++++++++++++++++++++++++++++++------------------ > src/mds/MDCache.h | 1 + > 2 files changed, 233 insertions(+), 116 deletions(-) > > diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc > index 344777e..38b1fdf 100644 > --- a/src/mds/MDCache.cc > +++ b/src/mds/MDCache.cc > @@ -3536,7 +3536,6 @@ void MDCache::rejoin_send_rejoins() > } else { > // strong > if (p->first == 0 && root) { > - p->second->add_weak_inode(root->vino()); > p->second->add_strong_inode(root->vino(), > root->get_replica_nonce(), > root->get_caps_wanted(), > @@ -3550,7 +3549,6 @@ void MDCache::rejoin_send_rejoins() > } > > if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) { > - p->second->add_weak_inode(in->vino()); > p->second->add_strong_inode(in->vino(), > in->get_replica_nonce(), > in->get_caps_wanted(), > @@ -3567,6 +3565,8 @@ void MDCache::rejoin_send_rejoins() > for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin(); > p != active_requests.end(); > ++p) { > + if ( p->second->is_slave()) > + continue; > // auth pins > for (set<MDSCacheObject*>::iterator q = p->second->remote_auth_pins.begin(); > q != p->second->remote_auth_pins.end(); > @@ -4226,6 +4226,8 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) > rejoin_potential_updated_scatterlocks.insert(in); > } > > + rejoin_unlinked_inodes[from].clear(); > + > // surviving peer may send incorrect dirfrag here (maybe they didn't > // get the fragment notify, or maybe we rolled back?). we need to > // infer the right frag and get them with the program. somehow. > @@ -4332,105 +4334,125 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) > > dn->add_replica(from, q->second.nonce); > dout(10) << " have " << *dn << dendl; > - > - // inode? > - if (dnl->is_primary()) { > - CInode *in = dnl->get_inode(); > - assert(in); > - > - if (strong->strong_inodes.count(in->vino())) { > - MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->vino()]; > > - // caps_wanted > - if (is.caps_wanted) { > - in->mds_caps_wanted[from] = is.caps_wanted; > - dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted) > - << " on " << *in << dendl; > - } > - > - // scatterlocks? > - // infer state from replica state: > - // * go to MIX if they might have wrlocks > - // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock) > - in->filelock.infer_state_from_strong_rejoin(is.filelock, true); // maybe also go to LOCK > - in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false); > - in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false); > - > - // auth pin? > - if (strong->authpinned_inodes.count(in->vino())) { > - MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()]; > - dout(10) << " inode authpin by " << r << " on " << *in << dendl; > - > - // get/create slave mdrequest > - MDRequest *mdr; > - if (have_request(r.reqid)) > - mdr = request_get(r.reqid); > - else > - mdr = request_start_slave(r.reqid, r.attempt, from); > - if (strong->frozen_authpin_inodes.count(in->vino())) { > - assert(!in->get_num_auth_pins()); > - mdr->freeze_auth_pin(in); > - } else { > - assert(!in->is_frozen_auth_pin()); > - } > - mdr->auth_pin(in); > - } > - // xlock(s)? > - if (strong->xlocked_inodes.count(in->vino())) { > - for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->xlocked_inodes[in->vino()].begin(); > - r != strong->xlocked_inodes[in->vino()].end(); > - ++r) { > - SimpleLock *lock = in->get_lock(r->first); > - dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << dendl; > - MDRequest *mdr = request_get(r->second.reqid); // should have this from auth_pin above. > - assert(mdr->is_auth_pinned(in)); > - if (lock->is_stable()) > - in->auth_pin(lock); > - lock->set_state(LOCK_XLOCK); > - if (lock == &in->filelock) > - in->loner_cap = -1; > - lock->get_xlock(mdr, mdr->get_client()); > - mdr->xlocks.insert(lock); > - mdr->locks.insert(lock); > - } > - } > - // wrlock(s)? > - if (strong->wrlocked_inodes.count(in->vino())) { > - for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->wrlocked_inodes[in->vino()].begin(); > - r != strong->wrlocked_inodes[in->vino()].end(); > - ++r) { > - SimpleLock *lock = in->get_lock(r->first); > - dout(10) << " inode wrlock by " << r->second << " on " << *lock << " on " << *in << dendl; > - MDRequest *mdr = request_get(r->second.reqid); // should have this from auth_pin above. > - assert(mdr->is_auth_pinned(in)); > - lock->set_state(LOCK_LOCK); > - if (lock == &in->filelock) > - in->loner_cap = -1; > - lock->get_wrlock(true); > - mdr->wrlocks.insert(lock); > - mdr->locks.insert(lock); > - } > + if (dnl->is_primary()) { > + if (q->second.is_primary()) { > + if (!(vinodeno_t(q->second.ino, q->first.snapid) == dnl->get_inode()->vino())) { Maybe it's worth adding an operator!= for vinodeno_t, since you seem to use this a couple times. > + // the survivor missed MDentryUnlink+MDentryLink messages ? > + assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0); > + CInode *in = get_inode(q->second.ino, q->first.snapid); > + assert(in); > + rejoin_unlinked_inodes[from].insert(in); > + dout(7) << " sender has primary dentry but wrong inode" << dendl; > } > } else { > - dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl; > + // the survivor missed MDentryLink message ? > + assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0); > + dout(7) << " sender doesn't primay dentry" << dendl; doesn't have primary? or something else? > + } > + } else { > + if (q->second.is_primary()) { > + // the survivor missed MDentryUnlink message ? > + CInode *in = get_inode(q->second.ino, q->first.snapid); > + assert(in); > + rejoin_unlinked_inodes[from].insert(in); > + dout(7) << " sender has primary dentry but we don't" << dendl; > } > - > - in->add_replica(from, p->second.nonce); > - dout(10) << " have " << *in << dendl; > } > } > } > > - // base inodes? (root, stray, etc.) > - for (set<vinodeno_t>::iterator p = strong->weak_inodes.begin(); > - p != strong->weak_inodes.end(); > + for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin(); > + p != strong->strong_inodes.end(); > ++p) { > - CInode *in = get_inode(*p); > - dout(10) << " have base " << *in << dendl; > - in->add_replica(from); > + CInode *in = get_inode(p->first); > + assert(in); > + in->add_replica(from, p->second.nonce); > + dout(10) << " have " << *in << dendl; > + > + MMDSCacheRejoin::inode_strong &is = p->second; > + > + // caps_wanted > + if (is.caps_wanted) { > + in->mds_caps_wanted[from] = is.caps_wanted; > + dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted) > + << " on " << *in << dendl; > + } > + > + // scatterlocks? > + // infer state from replica state: > + // * go to MIX if they might have wrlocks > + // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock) > + in->filelock.infer_state_from_strong_rejoin(is.filelock, true); // maybe also go to LOCK > + in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false); > + in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false); > + > + // auth pin? > + if (strong->authpinned_inodes.count(in->vino())) { > + MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()]; > + dout(10) << " inode authpin by " << r << " on " << *in << dendl; > + > + // get/create slave mdrequest > + MDRequest *mdr; > + if (have_request(r.reqid)) > + mdr = request_get(r.reqid); > + else > + mdr = request_start_slave(r.reqid, r.attempt, from); > + if (strong->frozen_authpin_inodes.count(in->vino())) { > + assert(!in->get_num_auth_pins()); > + mdr->freeze_auth_pin(in); > + } else { > + assert(!in->is_frozen_auth_pin()); > + } > + mdr->auth_pin(in); > + } > + // xlock(s)? > + if (strong->xlocked_inodes.count(in->vino())) { > + for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin(); > + q != strong->xlocked_inodes[in->vino()].end(); > + ++q) { > + SimpleLock *lock = in->get_lock(q->first); > + dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl; > + MDRequest *mdr = request_get(q->second.reqid); // should have this from auth_pin above. > + assert(mdr->is_auth_pinned(in)); > + if (lock->is_stable()) > + in->auth_pin(lock); > + lock->set_state(LOCK_XLOCK); > + if (lock == &in->filelock) > + in->loner_cap = -1; > + lock->get_xlock(mdr, mdr->get_client()); > + mdr->xlocks.insert(lock); > + mdr->locks.insert(lock); > + } > + } > + // wrlock(s)? > + if (strong->wrlocked_inodes.count(in->vino())) { > + for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->wrlocked_inodes[in->vino()].begin(); > + q != strong->wrlocked_inodes[in->vino()].end(); > + ++q) { > + SimpleLock *lock = in->get_lock(q->first); > + dout(10) << " inode wrlock by " << q->second << " on " << *lock << " on " << *in << dendl; > + MDRequest *mdr = request_get(q->second.reqid); // should have this from auth_pin above. > + assert(mdr->is_auth_pinned(in)); > + lock->set_state(LOCK_LOCK); > + if (lock == &in->filelock) > + in->loner_cap = -1; > + lock->get_wrlock(true); > + mdr->wrlocks.insert(lock); > + mdr->locks.insert(lock); > + } > + } > } > > - > + // unlinked inodes should be in stray > + for (set<CInode*>::iterator p = rejoin_unlinked_inodes[from].begin(); > + p != rejoin_unlinked_inodes[from].end(); > + ++p) { > + CInode *in = *p; > + dout(7) << " unlinked inode " << *in << dendl; > + assert(in->get_parent_dn()); > + assert(in->is_replica(from)); > + } I'm not clear on why we need to check this here — the previous for loop wasn't adding any inodes to the cache, so shouldn't we just check these conditions as we add them? > > // done? > assert(rejoin_gather.count(from)); > @@ -4448,6 +4470,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) > dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl; > int from = ack->get_source().num(); > > + // for sending cache expire message > + list<CInode*> isolated_inodes; > + > // dirs > for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin(); > p != ack->strong_dirfrags.end(); > @@ -4455,7 +4480,29 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) > // we may have had incorrect dir fragmentation; refragment based > // on what they auth tells us. > CDir *dir = get_force_dirfrag(p->first); > - assert(dir); > + if (!dir) { > + CInode *diri = get_inode(p->first.ino); > + if (!diri) { > + // barebones inode; the full inode loop below will clean up. > + diri = new CInode(this, false); > + diri->inode.ino = p->first.ino; > + diri->inode.mode = S_IFDIR; > + if (MDS_INO_MDSDIR(p->first.ino)) { > + diri->inode_auth = pair<int,int>(from, CDIR_AUTH_UNKNOWN); > + add_inode(diri); > + dout(10) << " add inode " << *diri << dendl; > + } else { > + diri->inode_auth = CDIR_AUTH_UNDEF; > + isolated_inodes.push_back(diri); > + dout(10) << " unconnected dirfrag " << p->first << dendl; > + } > + } > + // barebones dirfrag; the full dirfrag loop below will clean up. > + dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false)); > + if (dir->authority().first != from) > + adjust_subtree_auth(dir, from); > + dout(10) << " add dirfrag " << *dir << dendl; > + } > > dir->set_replica_nonce(p->second.nonce); > dir->state_clear(CDir::STATE_REJOINING); > @@ -4467,7 +4514,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) > q != dmap.end(); > ++q) { > CDentry *dn = dir->lookup(q->first.name, q->first.snapid); > - assert(dn); > + if(!dn) > + dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid); > + > CDentry::linkage_t *dnl = dn->get_linkage(); > > assert(dn->last == q->first.snapid); > @@ -4476,33 +4525,48 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) > dn->first = q->second.first; > } > > + // may have bad linkage if we missed dentry link/unlink messages > + if (dnl->is_primary()) { > + CInode *in = dnl->get_inode(); > + if (!q->second.is_primary() || > + !(vinodeno_t(q->second.ino, q->first.snapid) == in->vino())) { > + dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl; > + dir->unlink_inode(dn); > + } > + } else if (dnl->is_remote()) { > + if (!q->second.is_remote() || > + q->second.remote_ino != dnl->get_remote_ino() || > + q->second.remote_d_type != dnl->get_remote_d_type()) { > + dout(10) << " had bad linkage for " << *dn << dendl; > + dir->unlink_inode(dn); > + } > + } else { > + if (!q->second.is_null()) > + dout(10) << " had bad linkage for " << *dn << dendl; > + } > + > // hmm, did we have the proper linkage here? > - if (dnl->is_null() && > - !q->second.is_null()) { > - dout(10) << " had bad (missing) linkage for " << *dn << dendl; > + if (dnl->is_null() && !q->second.is_null()) { > if (q->second.is_remote()) { > dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type); > } else { > CInode *in = get_inode(q->second.ino, q->first.snapid); > - assert(in == 0); // a rename would have been caught be the resolve stage. > - // barebones inode; the full inode loop below will clean up. > - in = new CInode(this, false, q->second.first, q->first.snapid); > - in->inode.ino = q->second.ino; > - add_inode(in); > + if (!in) { > + // barebones inode; assume it's dir, the full inode loop below will clean up. > + in = new CInode(this, false, q->second.first, q->first.snapid); > + in->inode.ino = q->second.ino; > + in->inode.mode = S_IFDIR; > + add_inode(in); > + dout(10) << " add inode " << *in << dendl; > + } else if (in->get_parent_dn()) { > + dout(10) << " had bad linkage for " << *(in->get_parent_dn()) > + << ", unlinking " << *in << dendl; > + in->get_parent_dir()->unlink_inode(in->get_parent_dn()); > + } > dn->dir->link_primary_inode(dn, in); > } > } > - else if (!dnl->is_null() && > - q->second.is_null()) { > - dout(0) << " had bad linkage for " << *dn << dendl; > - /* > - * this should happen: > - * if we're a survivor, any unlink should commit or rollback during > - * the resolve stage. > - * if we failed, we shouldn't have non-auth leaf dentries at all > - */ > - assert(0); // uh oh. > - } > + > dn->set_replica_nonce(q->second.nonce); > dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters); > dn->state_clear(CDentry::STATE_REJOINING); > @@ -4564,6 +4628,21 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) > dout(10) << " got inode locks " << *in << dendl; > } > > + // trim unconnected subtree > + if (!isolated_inodes.empty()) { > + map<int, MCacheExpire*> expiremap; > + for (list<CInode*>::iterator p = isolated_inodes.begin(); > + p != isolated_inodes.end(); > + ++p) { > + list<CDir*> ls; > + (*p)->get_dirfrags(ls); > + trim_dirfrag(*ls.begin(), 0, expiremap); > + assert((*p)->get_num_ref() == 0); > + delete *p; > + } > + send_expire_messages(expiremap); > + } > + > // done? > assert(rejoin_ack_gather.count(from)); > rejoin_ack_gather.erase(from); > @@ -5164,6 +5243,37 @@ void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snap > void MDCache::rejoin_send_acks() > { > dout(7) << "rejoin_send_acks" << dendl; > + > + // replicate stray > + for (map<int, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin(); > + p != rejoin_unlinked_inodes.end(); > + ++p) { > + for (set<CInode*>::iterator q = p->second.begin(); > + q != p->second.end(); > + ++q) { > + CInode *in = *q; > + dout(7) << " unlinked inode " << *in << dendl; > + // inode expired > + if (!in->is_replica(p->first)) > + continue; > + while (1) { > + CDentry *dn = in->get_parent_dn(); > + if (dn->is_replica(p->first)) > + break; > + dn->add_replica(p->first); > + CDir *dir = dn->get_dir(); > + if (dir->is_replica(p->first)) > + break; > + dir->add_replica(p->first); > + in = dir->get_inode(); > + if (in->is_replica(p->first)) > + break; > + if (in->is_base()) > + break; > + } > + } > + } > + rejoin_unlinked_inodes.clear(); > > // send acks to everyone in the recovery set > map<int,MMDSCacheRejoin*> ack; > @@ -5203,23 +5313,29 @@ void MDCache::rejoin_send_acks() > CDentry *dn = q->second; > CDentry::linkage_t *dnl = dn->get_linkage(); > > + // inode > + CInode *in = NULL; > + if (dnl->is_primary()) > + in = dnl->get_inode(); > + > // dentry > for (map<int,int>::iterator r = dn->replicas_begin(); > r != dn->replicas_end(); > - ++r) > + ++r) { > ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last, > dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0), > dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0), > dnl->is_remote() ? dnl->get_remote_d_type():0, > ++r->second, > dn->lock.get_replica_state()); > + // peer missed MDentrylink message ? > + if (in && !in->is_replica(r->first)) > + in->add_replica(r->first); > + } > > - if (!dnl->is_primary()) > + if (!in) > continue; > > - // inode > - CInode *in = dnl->get_inode(); > - > for (map<int,int>::iterator r = in->replicas_begin(); > r != in->replicas_end(); > ++r) { > diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h > index 85f5d65..09cc092 100644 > --- a/src/mds/MDCache.h > +++ b/src/mds/MDCache.h > @@ -416,6 +416,7 @@ protected: > set<CInode*> rejoin_undef_inodes; > set<CInode*> rejoin_potential_updated_scatterlocks; > set<CDir*> rejoin_undef_dirfrags; > + map<int, set<CInode*> > rejoin_unlinked_inodes; > > vector<CInode*> rejoin_recover_q, rejoin_check_q; > list<Context*> rejoin_waiters; > -- > 1.7.11.7 > -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html