[PATCH] mds: sort dentries when committing dir fragment

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: "Yan, Zheng" <zheng.z.yan@xxxxxxxxx>

Currently ceph mds uses tmap to store dir fragments. Dentry_key_t's
string representation is used as key for the tmap. When writing or
updating a tmap, the OSDs expect the keys to be provided in ascending
order. Current code encodes dentries by the order of dentry_key_t(s)
when committing dir fragment. The problem here is that we may get
different results when comparing dentry_key_t(s) and their string
representations. So the MDS may send data/commands sorted in the
wrong order to the OSDs. It confuses the OSDs and causes corruption.

Comparing dentry_key_t(s) and their string representations gives
different results only when name string in one dentry_key_t is prefix
of name string in another dentry_key_t. So the fix is checking the
special case and re-sorting dentries that are in the wrong order.

Signed-off-by: Yan, Zheng <zheng.z.yan@xxxxxxxxx>
---
 src/mds/CDir.cc    | 154 ++++++++++++++++++++++++++++++++++++++++++-----------
 src/mds/mdstypes.h |  16 +++---
 2 files changed, 130 insertions(+), 40 deletions(-)

diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 2b4f7c7..e724e61 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1720,24 +1720,65 @@ CDir::map_t::iterator CDir::_commit_full(ObjectOperation& m, const set<snapid_t>
   ::encode(fnode, header);
   max_write_size -= header.length();
 
+  /*
+   * We may get different results when comparing dentry_key_t(s) and their
+   * string representations. It happens only when name string in one dentry_key_t
+   * is prefix of name string in another dentry_key_t. Tmap uses dentry_key_t's
+   * string representation as key. When writing or updating a tmap, the osd
+   * expects the keys to be provided in ascending order. So we need re-sort
+   * the dentries here.
+   */
+  map<string, CDentry*> pending_items;
+
   map_t::iterator p = items.begin();
-  while (p != items.end() && bl.length() < max_write_size) {
-    CDentry *dn = p->second;
-    p++;
-    
-    if (dn->linkage.is_null()) 
-      continue;  // skip negative entries
+  while ((p != items.end() || pending_items.size()) && bl.length() < max_write_size) {
+    if (p != items.end()) {
+      CDentry *dn = p->second;
+      p++;
 
-    if (snaps && dn->last != CEPH_NOSNAP &&
-	try_trim_snap_dentry(dn, *snaps))
-      continue;
-    
-    n++;
+      if (dn->linkage.is_null())
+	continue;  // skip negative entries
+
+      if (snaps && dn->last != CEPH_NOSNAP &&
+	  try_trim_snap_dentry(dn, *snaps))
+	continue;
+
+      n++;
+
+      if (pending_items.empty()) {
+	int len = 0;
+	if (p != items.end())
+	  len = min(dn->name.length(), p->second->name.length());
+	if (p == items.end() || dn->name.compare(0, len, p->second->name, 0, len) < 0) {
+	  _encode_dentry(dn, bl, snaps);
+	} else {
+	  pending_items[dn->key().str()] = dn;
+	}
+	continue;
+      }
+
+      pending_items[dn->key().str()] = dn;
+      if (p != items.end()) {
+	string& last_pending = pending_items.rbegin()->second->name;
+	int len = min(last_pending.length(), p->second->name.length());
+	if (last_pending.compare(0, len, p->second->name, 0, len) >= 0)
+	  continue;
+      }
+    }
+
+    for (map<string, CDentry*>::iterator it = pending_items.begin();
+	 it != pending_items.end(); it++) {
+      CDentry *dn = it->second;
+      _encode_dentry(dn, bl, snaps);
+    }
 
-    _encode_dentry(dn, bl, snaps);
+    if (bl.length() > max_write_size)
+      break;
+
+    pending_items.clear();
   }
 
-  if (p != items.end()) {
+  if (p != items.end() || pending_items.size()) {
     assert(bl.length() > max_write_size);
     return _commit_partial(m, snaps, max_write_size);
   }
@@ -1790,31 +1831,82 @@ CDir::map_t::iterator CDir::_commit_partial(ObjectOperation& m,
   if(last_committed_dn != map_t::iterator())
     p = last_committed_dn;
 
-  while (p != items.end() && finalbl.length() < max_write_size) {
-    CDentry *dn = p->second;
-    ++p;
-    
-    if (snaps && dn->last != CEPH_NOSNAP &&
-	try_trim_snap_dentry(dn, *snaps))
-      continue;
+  // see comments in _commit_full()
+  map_t::iterator next_dn = p;
+  map<string, CDentry*> pending_items;
 
-    if (!dn->is_dirty())
-      continue;  // skip clean dentries
+  while ((p != items.end() || pending_items.size()) && finalbl.length() < max_write_size) {
+    if (p != items.end()) {
+      CDentry *dn = p->second;
+      ++p;
 
-    if (dn->get_linkage()->is_null()) {
-      dout(10) << " rm " << dn->name << " " << *dn << dendl;
-      finalbl.append(CEPH_OSD_TMAP_RM);
-      dn->key().encode(finalbl);
-    } else {
-      dout(10) << " set " << dn->name << " " << *dn << dendl;
-      finalbl.append(CEPH_OSD_TMAP_SET);
-      _encode_dentry(dn, finalbl, snaps);
+      if (snaps && dn->last != CEPH_NOSNAP &&
+	  try_trim_snap_dentry(dn, *snaps))
+	continue;
+
+      if (!dn->is_dirty())
+	continue;  // skip clean dentries
+
+      if (pending_items.empty()) {
+	int len = 0;
+	if (p != items.end())
+	  len = min(dn->name.length(), p->second->name.length());
+	if (p == items.end() || dn->name.compare(0, len, p->second->name, 0, len) < 0) {
+	  if (dn->get_linkage()->is_null()) {
+	    dout(10) << " rm " << dn->name << " " << *dn << dendl;
+	    finalbl.append(CEPH_OSD_TMAP_RM);
+	    dn->key().encode(finalbl);
+	  } else {
+	    dout(10) << " set " << dn->name << " " << *dn << dendl;
+	    finalbl.append(CEPH_OSD_TMAP_SET);
+	    _encode_dentry(dn, finalbl, snaps);
+	  }
+	  next_dn = p;
+	} else {
+	  pending_items[dn->key().str()] = dn;
+	}
+	continue;
+      }
+
+      pending_items[dn->key().str()] = dn;
+      if (p != items.end()) {
+	string& last_pending = pending_items.rbegin()->second->name;
+	int len = min(last_pending.length(), p->second->name.length());
+	if (last_pending.compare(0, len, p->second->name, 0, len) >= 0)
+	  continue;
+      }
+    }
+
+    bufferlist bl;
+    for (map<string, CDentry*>::iterator it = pending_items.begin();
+	 it != pending_items.end(); it++) {
+      CDentry *dn = it->second;
+      if (dn->get_linkage()->is_null()) {
+	dout(10) << " rm " << dn->name << " " << *dn << dendl;
+	bl.append(CEPH_OSD_TMAP_RM);
+	dn->key().encode(bl);
+      } else {
+	dout(10) << " set " << dn->name << " " << *dn << dendl;
+	bl.append(CEPH_OSD_TMAP_SET);
+	_encode_dentry(dn, bl, snaps);
+      }
     }
+
+    if (bl.length() + finalbl.length() > max_write_size)
+      break;
+
+    pending_items.clear();
+
+    finalbl.claim_append(bl);
+    next_dn = p;
   }
 
+  if (p == items.end() && pending_items.empty())
+    next_dn = p;
+
   // update the trivialmap at the osd
   m.tmap_update(finalbl);
-  return p;
+  return next_dn;
 }
 
 void CDir::_encode_dentry(CDentry *dn, bufferlist& bl,
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index 22e754e..9c329a1 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -662,20 +662,18 @@ struct dentry_key_t {
   // encode into something that can be decoded as a string.
   // name_ (head) or name_%x (!head)
   void encode(bufferlist& bl) const {
-    __u32 l = strlen(name) + 1;
+    ::encode(str(), bl);
+  }
+  string str() const {
+    string str = name;
     char b[20];
     if (snapid != CEPH_NOSNAP) {
       uint64_t val(snapid);
-      snprintf(b, sizeof(b), "%" PRIx64, val);
-      l += strlen(b);
+      snprintf(b, sizeof(b), "_%" PRIx64, val);
     } else {
-      snprintf(b, sizeof(b), "%s", "head");
-      l += 4;
+      snprintf(b, sizeof(b), "_%s", "head");
     }
-    ::encode(l, bl);
-    bl.append(name, strlen(name));
-    bl.append("_", 1);
-    bl.append(b);
+    return str + b;
   }
   static void decode_helper(bufferlist::iterator& bl, string& nm, snapid_t& sn) {
     string foo;
-- 
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux