Re: cephfs: Normal user of our fs can damage the whole system by writing huge xattr kv pairs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Apr 12, 2017 at 3:46 AM, Yang Joseph <joseph.yang@xxxxxxxxxxxx> wrote:
> hello,
>
> Normal user of our fs can damage the whole system by writing huge xattr kv
> pairs. I think this is a serious bug. Your quick reply can help me to fix
> this bug as soon as possible. Thank you for your time to review this patch.

Apologies for the delay on this one.  I've commented on the PR.

Thanks,
John

>
> - Yang Honggang
>
> -------- Forwarded Message --------
> Subject:        cephfs: fix write_buf's _len overflow problem
> Date:   Thu, 23 Feb 2017 13:40:41 +0800
> From:   Yang Joseph <joseph.yang@xxxxxxxxxxxx>
> To:     john.spray@xxxxxxxxxx
> CC:     ceph-devel <ceph-devel@xxxxxxxxxxxxxxx>, peng.hse
> <peng.hse@xxxxxxxxxxxx>
>
>
>
> Hello,
>
> After I have set about 400 64KB xattr kv pairs to a file,
> mds is crashed. Every time I try to start mds, it will crash again.
> The root reason is write_buf._len overflowed when doing
> Journaler::append_entry().
>
> my suggestions:
>
> |1. limit file/dir's xattr size 2. throttle journal entry append
> operations 3. add bufferlist _len overflow checking|
>
>
> *bug analysis*: http://tracker.ceph.com/issues/19033
>
> *proposed fix*: https://github.com/ceph/ceph/pull/13587
>
> Looking forward to your reply and comments.
>
> Thx,
>
> Yang Honggang
>
> -------------- patch begin // master branch -----
> diff --git a/src/common/buffer.cc b/src/common/buffer.cc
> index 883e713..99b380d 100644
> --- a/src/common/buffer.cc
> +++ b/src/common/buffer.cc
> @@ -984,6 +984,7 @@ static simple_spinlock_t buffer_debug_lock =
> SIMPLE_SPINLOCK_INITIALIZER;
>       char* ptr = _raw->data + _off + _len;
>       *ptr = c;
> _len++;
> +    assert(_len != 0);
>       return _len + _off;
> }
>
> @@ -994,6 +995,7 @@ static simple_spinlock_t buffer_debug_lock =
> SIMPLE_SPINLOCK_INITIALIZER;
>       char* c = _raw->data + _off + _len;
>       maybe_inline_memcpy(c, p, l, 32);
>       _len += l;
> +    assert(_len >= l);
>       return _len + _off;
>     }
>
> @@ -1707,6 +1709,7 @@ static simple_spinlock_t buffer_debug_lock =
> SIMPLE_SPINLOCK_INITIALIZER;
>     {
>       // steal the other guy's buffers
>       _len += bl._len;
> +    assert(_len >= bl._len);
>       if (!(flags & CLAIM_ALLOW_NONSHAREABLE))
>         bl.make_shareable();
>       _buffers.splice(_buffers.end(), bl._buffers );
> @@ -1718,6 +1721,7 @@ static simple_spinlock_t buffer_debug_lock =
> SIMPLE_SPINLOCK_INITIALIZER;
>     {
>       // steal the other guy's buffers
>       _len += bl._len;
> +    assert(_len >= bl._len);
>       if (!(flags & CLAIM_ALLOW_NONSHAREABLE))
>         bl.make_shareable();
>       _buffers.splice(_buffers.begin(), bl._buffers );
> @@ -1832,6 +1836,7 @@ static simple_spinlock_t buffer_debug_lock =
> SIMPLE_SPINLOCK_INITIALIZER;
>          // yay contiguous with tail bp!
>          l.set_length(l.length()+len);
>          _len += len;
> +        assert(_len >= len);
>          return;
>         }
>       }
> @@ -1842,6 +1847,7 @@ static simple_spinlock_t buffer_debug_lock =
> SIMPLE_SPINLOCK_INITIALIZER;
>     void buffer::list::append(const list& bl)
>     {
>       _len += bl._len;
> +    assert(_len >= bl._len);
>       for (std::list<ptr>::const_iterator p = bl._buffers.begin();
>           p != bl._buffers.end();
>           ++p)
> @@ -2038,6 +2044,7 @@ static simple_spinlock_t buffer_debug_lock =
> SIMPLE_SPINLOCK_INITIALIZER;
>         //cout << "keeping front " << off << " of " << *curbuf << std::endl;
>         _buffers.insert( curbuf, ptr( *curbuf, 0, off ) );
>         _len += off;
> +      assert(_len >= off);
>       }
>
>       while (len > 0) {
> diff --git a/src/common/config_opts.h b/src/common/config_opts.h
> index 3a27dbe..01336e5 100644
> --- a/src/common/config_opts.h
> +++ b/src/common/config_opts.h
> @@ -485,6 +485,7 @@ OPTION(journaler_batch_interval, OPT_DOUBLE, .001)
> // seconds.. max add latenc
>   OPTION(journaler_batch_max, OPT_U64, 0)  // max bytes we'll delay
> flushing; disable, for now....
>   OPTION(mds_data, OPT_STR, "/var/lib/ceph/mds/$cluster-$id")
>   OPTION(mds_max_file_size, OPT_U64, 1ULL << 40) // Used when creating
> new CephFS. Change with 'ceph mds set max_file_size <size>' afterwards
> +OPTION(mds_max_xattr_pairs_size, OPT_U32, 1 << 16) // max xattr kv
> pairs size for each dir/file
>   OPTION(mds_cache_size, OPT_INT, 100000)
>   OPTION(mds_cache_mid, OPT_FLOAT, .7)
>   OPTION(mds_max_file_recover, OPT_U32, 32)
> diff --git a/src/include/buffer.h b/src/include/buffer.h
> index a016bab..a997b4e 100644
> --- a/src/include/buffer.h
> +++ b/src/include/buffer.h
> @@ -729,12 +729,14 @@ namespace buffer CEPH_BUFFER_API {
>          return;
>         _buffers.push_front(bp);
>         _len += bp.length();
> +      assert(_len >= bp.length());
>       }
>       void push_front(ptr&& bp) {
>         if (bp.length() == 0)
>          return;
>         _len += bp.length();
>         _buffers.push_front(std::move(bp));
> +      assert(_len >= bp.length());
>       }
>       void push_front(raw *r) {
>         push_front(ptr(r));
> @@ -744,11 +746,13 @@ namespace buffer CEPH_BUFFER_API {
>          return;
>         _buffers.push_back(bp);
>         _len += bp.length();
> +      assert(_len >= bp.length());
>       }
>       void push_back(ptr&& bp) {
>         if (bp.length() == 0)
>          return;
>         _len += bp.length();
> +      assert(_len >= bp.length());
>         _buffers.push_back(std::move(bp));
>       }
>       void push_back(raw *r) {
> diff --git a/src/mds/Server.cc b/src/mds/Server.cc
> index 539b7a8..9ddf852 100644
> --- a/src/mds/Server.cc
> +++ b/src/mds/Server.cc
> @@ -3416,7 +3416,7 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
>       max = dir->get_num_any();  // whatever, something big.
>     unsigned max_bytes = req->head.args.readdir.max_bytes;
>     if (!max_bytes)
> -    max_bytes = 512 << 10;  // 512 KB?
> +    max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;  //
> make sure at least one item can be encoded
>
>     // start final blob
>     bufferlist dirbl;
> @@ -4535,6 +4535,25 @@ void Server::handle_client_setxattr(MDRequestRef&
> mdr)
>       return;
>
>     map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
> +  size_t len = req->get_data().length();
> +  size_t inc = len + name.length();
> +
> +  // check xattrs kv pairs size
> +  size_t cur_xattrs_size = 0;
> +  for (map<string, bufferptr>::iterator p = pxattrs->begin();
> +          p != pxattrs->end(); p++) {
> +    if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p->first) == 0)) {
> +      continue;
> +    }
> +    cur_xattrs_size += p->first.length() + p->second.length();
> +  }
> +  if (((cur_xattrs_size + inc) > g_conf->mds_max_xattr_pairs_size)) {
> +    dout(10) << "xattr kv pairs size too big. cur_xattrs_size " <<
> cur_xattrs_size
> +                << ", inc " << inc << dendl;
> +    respond_to_request(mdr, -ENOSPC);
> +    return;
> +  }
> +
>     if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(name)) {
>       dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on
> " << *cur << dendl;
>       respond_to_request(mdr, -EEXIST);
> @@ -4546,7 +4565,6 @@ void Server::handle_client_setxattr(MDRequestRef& mdr)
>       return;
>     }
>
> -  int len = req->get_data().length();
>     dout(10) << "setxattr '" << name << "' len " << len << " on " <<
> *cur << dendl;
>
>     // project update
> @@ -8025,7 +8043,7 @@ void Server::handle_client_lssnap(MDRequestRef& mdr)
>       max_entries = infomap.size();
>     int max_bytes = req->head.args.readdir.max_bytes;
>     if (!max_bytes)
> -    max_bytes = 512 << 10;
> +    max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;  //
> make sure at least one item can be encoded
>
>     __u64 last_snapid = 0;
>     string offset_str = req->get_path2();
> diff --git a/src/osdc/Journaler.cc b/src/osdc/Journaler.cc
> index 8d2b33a..132d4cf 100644
> --- a/src/osdc/Journaler.cc
> +++ b/src/osdc/Journaler.cc
> @@ -537,7 +537,7 @@ void Journaler::_finish_flush(int r, uint64_t start,
> ceph::real_time stamp)
>
>   uint64_t Journaler::append_entry(bufferlist& bl)
>   {
> -  lock_guard l(lock);
> +  unique_lock l(lock);
>
>     assert(!readonly);
>     uint32_t s = bl.length();
> @@ -556,6 +556,13 @@ uint64_t Journaler::append_entry(bufferlist& bl)
>         bufferptr bp(write_pos - owp);
>         bp.zero();
>         assert(bp.length() >= 4);
> +      if (!write_buf_throttle.get_or_fail(bp.length())) {
> +        l.unlock();
> +        ldout(cct, 10) << "write_buf_throttle wait, bp len " <<
> bp.length() << dendl;
> +        write_buf_throttle.get(bp.length());
> +        l.lock();
> +      }
> +      ldout(cct, 20) << "write_buf_throttle get, bp len " <<
> bp.length() << dendl;
>         write_buf.push_back(bp);
>
>         // now flush.
> @@ -569,6 +576,14 @@ uint64_t Journaler::append_entry(bufferlist& bl)
>
>
>     // append
> +  size_t delta = bl.length() + journal_stream.get_envelope_size();
> +  if (!write_buf_throttle.get_or_fail(delta)) { // write_buf space is
> nearly full
> +    l.unlock();
> +    ldout(cct, 10) << "write_buf_throttle wait, delta " << delta << dendl;
> +    write_buf_throttle.get(delta);
> +    l.lock();
> +  }
> +  ldout(cct, 20) << "write_buf_throttle get, delta " << delta << dendl;
>     size_t wrote = journal_stream.write(bl, &write_buf, write_pos);
>     ldout(cct, 10) << "append_entry len " << s << " to " << write_pos << "~"
>                   << wrote << dendl;
> @@ -598,7 +613,7 @@ void Journaler::_do_flush(unsigned amount)
>     assert(!readonly);
>
>     // flush
> -  unsigned len = write_pos - flush_pos;
> +  uint64_t len = write_pos - flush_pos;
>     assert(len == write_buf.length());
>     if (amount && amount < len)
>       len = amount;
> @@ -654,7 +669,9 @@ void Journaler::_do_flush(unsigned amount)
>
>     flush_pos += len;
>     assert(write_buf.length() == write_pos - flush_pos);
> -
> +  write_buf_throttle.put(len);
> +  ldout(cct, 20) << "write_buf_throttle put, len " << len << dendl;
> +
>     ldout(cct, 10)
>       << "_do_flush (prezeroing/prezero)/write/flush/safe pointers now at "
>       << "(" << prezeroing_pos << "/" << prezero_pos << ")/" << write_pos
> diff --git a/src/osdc/Journaler.h b/src/osdc/Journaler.h
> index 6c7e7cf..3831b86 100644
> --- a/src/osdc/Journaler.h
> +++ b/src/osdc/Journaler.h
> @@ -64,7 +64,7 @@
>   #include "Filer.h"
>
>   #include "common/Timer.h"
> -
> +#include "common/Throttle.h"
>
>   class CephContext;
>   class Context;
> @@ -113,6 +113,13 @@ class JournalStream
>     bool readable(bufferlist &bl, uint64_t *need) const;
>     size_t read(bufferlist &from, bufferlist *to, uint64_t *start_ptr);
>     size_t write(bufferlist &entry, bufferlist *to, uint64_t const
> &start_ptr);
> +  size_t get_envelope_size() const {
> +     if (format >= JOURNAL_FORMAT_RESILIENT) {
> +       return JOURNAL_ENVELOPE_RESILIENT;
> +     } else {
> +       return JOURNAL_ENVELOPE_LEGACY;
> +     }
> +  }
>
>     // A magic number for the start of journal entries, so that we can
>     // identify them in damaged journals.
> @@ -295,6 +302,8 @@ private:
>     bufferlist write_buf; ///< write buffer.  flush_pos +
>                          ///  write_buf.length() == write_pos.
>
> +  Throttle write_buf_throttle; // protect write_buf from bufferlist
> _len overflow
> +
>     bool waiting_for_zero;
>     interval_set<uint64_t> pending_zero;  // non-contig bits we've zeroed
>     std::set<uint64_t> pending_safe;
> @@ -390,6 +399,7 @@ public:
>       timer(tim), delay_flush_event(0),
>       state(STATE_UNDEF), error(0),
>       prezeroing_pos(0), prezero_pos(0), write_pos(0), flush_pos(0),
> safe_pos(0),
> +    write_buf_throttle(cct, "write_buf_throttle", UINT_MAX - (UINT_MAX
>  >> 3)),
>       waiting_for_zero(false),
>       read_pos(0), requested_pos(0), received_pos(0),
>       fetch_len(0), temp_fetch_len(0),
> --------------- patch end ---------------------------
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux