Re: [PATCH] mds: sort dentries when committing dir fragment

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



I pushed an alternative approach to wip-tmap.

This sorting is an artifact of tmap's crummy implementation, and the mds 
workaround will need to get reverted when we switch to omap.  Instead, fix 
tmap so that it will tolerate unsorted keys.  (Also, drop the ENOENT on rm 
on missing key.)

Eventually we can deprecate and remove tmap entirely...

What do you think?
sage


On Sun, 25 Nov 2012, Yan, Zheng wrote:

> From: "Yan, Zheng" <zheng.z.yan@xxxxxxxxx>
> 
> Currently ceph mds uses tmap to store dir fragments. Dentry_key_t's
> string representation is used as key for the tmap. When writing or
> updating a tmap, the OSDs expect the keys to be provided in ascending
> order. Current code encodes dentries by the order of dentry_key_t(s)
> when committing dir fragment. The problem here is that we may get
> different results when comparing dentry_key_t(s) and their string
> representations. So the MDS may send data/commands sorted in the
> wrong order to the OSDs. It confuses the OSDs and causes corruption.
> 
> Comparing dentry_key_t(s) and their string representations gives
> different results only when name string in one dentry_key_t is prefix
> of name string in another dentry_key_t. So the fix is checking the
> special case and re-sorting dentries that are in the wrong order.
> 
> Signed-off-by: Yan, Zheng <zheng.z.yan@xxxxxxxxx>
> ---
>  src/mds/CDir.cc    | 154 ++++++++++++++++++++++++++++++++++++++++++-----------
>  src/mds/mdstypes.h |  16 +++---
>  2 files changed, 130 insertions(+), 40 deletions(-)
> 
> diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
> index 2b4f7c7..e724e61 100644
> --- a/src/mds/CDir.cc
> +++ b/src/mds/CDir.cc
> @@ -1720,24 +1720,65 @@ CDir::map_t::iterator CDir::_commit_full(ObjectOperation& m, const set<snapid_t>
>    ::encode(fnode, header);
>    max_write_size -= header.length();
>  
> +  /*
> +   * We may get different results when comparing dentry_key_t(s) and their
> +   * string representations. It happens only when name string in one dentry_key_t
> +   * is prefix of name string in another dentry_key_t. Tmap uses dentry_key_t's
> +   * string representation as key. When writing or updating a tmap, the osd
> +   * expects the keys to be provided in ascending order. So we need re-sort
> +   * the dentries here.
> +   */
> +  map<string, CDentry*> pending_items;
> +
>    map_t::iterator p = items.begin();
> -  while (p != items.end() && bl.length() < max_write_size) {
> -    CDentry *dn = p->second;
> -    p++;
> -    
> -    if (dn->linkage.is_null()) 
> -      continue;  // skip negative entries
> +  while ((p != items.end() || pending_items.size()) && bl.length() < max_write_size) {
> +    if (p != items.end()) {
> +      CDentry *dn = p->second;
> +      p++;
>  
> -    if (snaps && dn->last != CEPH_NOSNAP &&
> -	try_trim_snap_dentry(dn, *snaps))
> -      continue;
> -    
> -    n++;
> +      if (dn->linkage.is_null())
> +	continue;  // skip negative entries
> +
> +      if (snaps && dn->last != CEPH_NOSNAP &&
> +	  try_trim_snap_dentry(dn, *snaps))
> +	continue;
> +
> +      n++;
> +
> +      if (pending_items.empty()) {
> +	int len = 0;
> +	if (p != items.end())
> +	  len = min(dn->name.length(), p->second->name.length());
> +	if (p == items.end() || dn->name.compare(0, len, p->second->name, 0, len) < 0) {
> +	  _encode_dentry(dn, bl, snaps);
> +	} else {
> +	  pending_items[dn->key().str()] = dn;
> +	}
> +	continue;
> +      }
> +
> +      pending_items[dn->key().str()] = dn;
> +      if (p != items.end()) {
> +	string& last_pending = pending_items.rbegin()->second->name;
> +	int len = min(last_pending.length(), p->second->name.length());
> +	if (last_pending.compare(0, len, p->second->name, 0, len) >= 0)
> +	  continue;
> +      }
> +    }
> +
> +    for (map<string, CDentry*>::iterator it = pending_items.begin();
> +	 it != pending_items.end(); it++) {
> +      CDentry *dn = it->second;
> +      _encode_dentry(dn, bl, snaps);
> +    }
>  
> -    _encode_dentry(dn, bl, snaps);
> +    if (bl.length() > max_write_size)
> +      break;
> +
> +    pending_items.clear();
>    }
>  
> -  if (p != items.end()) {
> +  if (p != items.end() || pending_items.size()) {
>      assert(bl.length() > max_write_size);
>      return _commit_partial(m, snaps, max_write_size);
>    }
> @@ -1790,31 +1831,82 @@ CDir::map_t::iterator CDir::_commit_partial(ObjectOperation& m,
>    if(last_committed_dn != map_t::iterator())
>      p = last_committed_dn;
>  
> -  while (p != items.end() && finalbl.length() < max_write_size) {
> -    CDentry *dn = p->second;
> -    ++p;
> -    
> -    if (snaps && dn->last != CEPH_NOSNAP &&
> -	try_trim_snap_dentry(dn, *snaps))
> -      continue;
> +  // see comments in _commit_full()
> +  map_t::iterator next_dn = p;
> +  map<string, CDentry*> pending_items;
>  
> -    if (!dn->is_dirty())
> -      continue;  // skip clean dentries
> +  while ((p != items.end() || pending_items.size()) && finalbl.length() < max_write_size) {
> +    if (p != items.end()) {
> +      CDentry *dn = p->second;
> +      ++p;
>  
> -    if (dn->get_linkage()->is_null()) {
> -      dout(10) << " rm " << dn->name << " " << *dn << dendl;
> -      finalbl.append(CEPH_OSD_TMAP_RM);
> -      dn->key().encode(finalbl);
> -    } else {
> -      dout(10) << " set " << dn->name << " " << *dn << dendl;
> -      finalbl.append(CEPH_OSD_TMAP_SET);
> -      _encode_dentry(dn, finalbl, snaps);
> +      if (snaps && dn->last != CEPH_NOSNAP &&
> +	  try_trim_snap_dentry(dn, *snaps))
> +	continue;
> +
> +      if (!dn->is_dirty())
> +	continue;  // skip clean dentries
> +
> +      if (pending_items.empty()) {
> +	int len = 0;
> +	if (p != items.end())
> +	  len = min(dn->name.length(), p->second->name.length());
> +	if (p == items.end() || dn->name.compare(0, len, p->second->name, 0, len) < 0) {
> +	  if (dn->get_linkage()->is_null()) {
> +	    dout(10) << " rm " << dn->name << " " << *dn << dendl;
> +	    finalbl.append(CEPH_OSD_TMAP_RM);
> +	    dn->key().encode(finalbl);
> +	  } else {
> +	    dout(10) << " set " << dn->name << " " << *dn << dendl;
> +	    finalbl.append(CEPH_OSD_TMAP_SET);
> +	    _encode_dentry(dn, finalbl, snaps);
> +	  }
> +	  next_dn = p;
> +	} else {
> +	  pending_items[dn->key().str()] = dn;
> +	}
> +	continue;
> +      }
> +
> +      pending_items[dn->key().str()] = dn;
> +      if (p != items.end()) {
> +	string& last_pending = pending_items.rbegin()->second->name;
> +	int len = min(last_pending.length(), p->second->name.length());
> +	if (last_pending.compare(0, len, p->second->name, 0, len) >= 0)
> +	  continue;
> +      }
> +    }
> +
> +    bufferlist bl;
> +    for (map<string, CDentry*>::iterator it = pending_items.begin();
> +	 it != pending_items.end(); it++) {
> +      CDentry *dn = it->second;
> +      if (dn->get_linkage()->is_null()) {
> +	dout(10) << " rm " << dn->name << " " << *dn << dendl;
> +	bl.append(CEPH_OSD_TMAP_RM);
> +	dn->key().encode(bl);
> +      } else {
> +	dout(10) << " set " << dn->name << " " << *dn << dendl;
> +	bl.append(CEPH_OSD_TMAP_SET);
> +	_encode_dentry(dn, bl, snaps);
> +      }
>      }
> +
> +    if (bl.length() + finalbl.length() > max_write_size)
> +      break;
> +
> +    pending_items.clear();
> +
> +    finalbl.claim_append(bl);
> +    next_dn = p;
>    }
>  
> +  if (p == items.end() && pending_items.empty())
> +    next_dn = p;
> +
>    // update the trivialmap at the osd
>    m.tmap_update(finalbl);
> -  return p;
> +  return next_dn;
>  }
>  
>  void CDir::_encode_dentry(CDentry *dn, bufferlist& bl,
> diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
> index 22e754e..9c329a1 100644
> --- a/src/mds/mdstypes.h
> +++ b/src/mds/mdstypes.h
> @@ -662,20 +662,18 @@ struct dentry_key_t {
>    // encode into something that can be decoded as a string.
>    // name_ (head) or name_%x (!head)
>    void encode(bufferlist& bl) const {
> -    __u32 l = strlen(name) + 1;
> +    ::encode(str(), bl);
> +  }
> +  string str() const {
> +    string str = name;
>      char b[20];
>      if (snapid != CEPH_NOSNAP) {
>        uint64_t val(snapid);
> -      snprintf(b, sizeof(b), "%" PRIx64, val);
> -      l += strlen(b);
> +      snprintf(b, sizeof(b), "_%" PRIx64, val);
>      } else {
> -      snprintf(b, sizeof(b), "%s", "head");
> -      l += 4;
> +      snprintf(b, sizeof(b), "_%s", "head");
>      }
> -    ::encode(l, bl);
> -    bl.append(name, strlen(name));
> -    bl.append("_", 1);
> -    bl.append(b);
> +    return str + b;
>    }
>    static void decode_helper(bufferlist::iterator& bl, string& nm, snapid_t& sn) {
>      string foo;
> -- 
> 1.7.11.7
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux