[PATCH] osd: Do not subtract object overlaps from cache usage

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,
this is a followup of my question [1] which I partially answered by looking at
code history -- semantics of num_bytes isn't meant to include objects
overlapping areas. However, this results in underestimating of space
occupied on a cache tier proportional to the number of (overlapping) clone
objects stored there.

Could you please comment on the patch below if it's a right avenue to address
this misaccounting problem. (This is rather for a discussion, I plan to submit
a regular PR afterwards.)

Thanks,
Michal


[1] http://lists.ceph.com/pipermail/ceph-users-ceph.com/2017-April/017288.html

---

PG group field num_bytes does not include space that is part of clone
object overlaps. Introduce a new field num_bytes_overlap that stands for
this space only, so that it can be used to strictly limit cache tier
usage with object storage backends that do not support storing overlaps
efficiently.
---
 src/osd/PrimaryLogPG.cc | 15 ++++++++++++++-
 src/osd/PrimaryLogPG.h  |  1 +
 src/osd/osd_types.cc    |  9 ++++++++-
 src/osd/osd_types.h     |  6 +++++-
 4 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index 2f15999..12c02a0 100644
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -6407,6 +6407,7 @@ void PrimaryLogPG::make_writeable(OpContext *ctx)
       ctx->delta_stats.num_objects_omap++;
     if (snap_oi->is_cache_pinned())
       ctx->delta_stats.num_objects_pinned++;
+    ctx->delta_stats.num_bytes_overlap += ctx->obs->oi.size;
     ctx->delta_stats.num_object_clones++;
     ctx->new_snapset.clones.push_back(coid.snap);
     ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
@@ -6439,8 +6440,10 @@ void PrimaryLogPG::make_writeable(OpContext *ctx)
     if (is_present_clone(last_clone_oid)) {
       interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
       ctx->modified_ranges.intersection_of(newest_overlap);
-      // modified_ranges is still in use by the clone
+      // modified_ranges is still in use by the clone (but not in accounted overlap)
       add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
+      sub_interval_usage(ctx->modified_ranges, ctx->delta_stats);
+
       newest_overlap.subtract(ctx->modified_ranges);
     }
   }
@@ -6479,6 +6482,13 @@ void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum
   }
 }
 
+void PrimaryLogPG::sub_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
+{
+  for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
+    delta_stats.num_bytes_overlap -= p.get_len();
+  }
+}
+
 void PrimaryLogPG::complete_disconnect_watches(
   ObjectContextRef obc,
   const list<watch_disconnect_t> &to_disconnect)
@@ -12250,6 +12260,9 @@ bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
     num_user_objects = 0;
 
   uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
+  // TODO should go depend on osd->store
+  uint64_t overlap_bytes = info.stats.stats.sum.num_bytes_overlap;
+  num_user_bytes += overlap_bytes;
   uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
   num_user_bytes -= unflushable_bytes;
   uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index 3d92e3b9..a802da1 100644
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -1095,6 +1095,7 @@ protected:
 				   uint64_t length, bool count_bytes,
 				   bool force_changesize=false);
   void add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& st);
+  void sub_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& st);
 
 
   enum class cache_result_t {
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 2cce17e..25aaad6 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -1846,6 +1846,7 @@ ostream& operator<<(ostream& out, const pg_pool_t& p)
 void object_stat_sum_t::dump(Formatter *f) const
 {
   f->dump_int("num_bytes", num_bytes);
+  f->dump_int("num_bytes_overlap", num_bytes_overlap);
   f->dump_int("num_objects", num_objects);
   f->dump_int("num_object_clones", num_object_clones);
   f->dump_int("num_object_copies", num_object_copies);
@@ -1883,11 +1884,12 @@ void object_stat_sum_t::dump(Formatter *f) const
 
 void object_stat_sum_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(15, 3, bl);
+  ENCODE_START(16, 3, bl);
 #if defined(CEPH_LITTLE_ENDIAN)
   bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
 #else
   ::encode(num_bytes, bl);
+  ::encode(num_bytes_overlap, bl);
   ::encode(num_objects, bl);
   ::encode(num_object_clones, bl);
   ::encode(num_object_copies, bl);
@@ -1941,6 +1943,8 @@ void object_stat_sum_t::decode(bufferlist::iterator& bl)
       uint64_t num_kb;
       ::decode(num_kb, bl);
     }
+    if (struct_v >= 16)
+      ::decode(num_bytes_overlap, bl);
     ::decode(num_objects, bl);
     ::decode(num_object_clones, bl);
     ::decode(num_object_copies, bl);
@@ -2078,6 +2082,7 @@ void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
 void object_stat_sum_t::add(const object_stat_sum_t& o)
 {
   num_bytes += o.num_bytes;
+  num_bytes_overlap += o.num_bytes_overlap;
   num_objects += o.num_objects;
   num_object_clones += o.num_object_clones;
   num_object_copies += o.num_object_copies;
@@ -2116,6 +2121,7 @@ void object_stat_sum_t::add(const object_stat_sum_t& o)
 void object_stat_sum_t::sub(const object_stat_sum_t& o)
 {
   num_bytes -= o.num_bytes;
+  num_bytes_overlap -= o.num_bytes_overlap;
   num_objects -= o.num_objects;
   num_object_clones -= o.num_object_clones;
   num_object_copies -= o.num_object_copies;
@@ -2155,6 +2161,7 @@ bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
 {
   return
     l.num_bytes == r.num_bytes &&
+    l.num_bytes_overlap == r.num_bytes_overlap &&
     l.num_objects == r.num_objects &&
     l.num_object_clones == r.num_object_clones &&
     l.num_object_copies == r.num_object_copies &&
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 0d22975..59b29e0 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -1577,6 +1577,7 @@ struct object_stat_sum_t {
    * adding/removing fields!
    **************************************************************************/
   int64_t num_bytes;    // in bytes
+  int64_t num_bytes_overlap;  // date in clone objects overlaps
   int64_t num_objects;
   int64_t num_object_clones;
   int64_t num_object_copies;  // num_objects * num_replicas
@@ -1612,7 +1613,7 @@ struct object_stat_sum_t {
   int64_t num_objects_missing;
 
   object_stat_sum_t()
-    : num_bytes(0),
+    : num_bytes(0), num_bytes_overlap(0),
       num_objects(0), num_object_clones(0), num_object_copies(0),
       num_objects_missing_on_primary(0), num_objects_degraded(0),
       num_objects_unfound(0),
@@ -1643,6 +1644,7 @@ struct object_stat_sum_t {
   void floor(int64_t f) {
 #define FLOOR(x) if (x < f) x = f
     FLOOR(num_bytes);
+    FLOOR(num_bytes_overlap);
     FLOOR(num_objects);
     FLOOR(num_object_clones);
     FLOOR(num_object_copies);
@@ -1689,6 +1691,7 @@ struct object_stat_sum_t {
     }                                           \
 
     SPLIT(num_bytes);
+    SPLIT(num_bytes_overlap);
     SPLIT(num_objects);
     SPLIT(num_object_clones);
     SPLIT(num_object_copies);
@@ -1745,6 +1748,7 @@ struct object_stat_sum_t {
     static_assert(
       sizeof(object_stat_sum_t) ==
         sizeof(num_bytes) +
+        sizeof(num_bytes_overlap) +
         sizeof(num_objects) +
         sizeof(num_object_clones) +
         sizeof(num_object_copies) +
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux