The attempt to reduce latency for ReplicatedBackend

chen kael <chenji.bupt@xxxxxxxxx> · Wed, 4 Mar 2020 18:23:40 +0800

Dear Cepher
As all we know, PrimaryPG will never return success to client until
all replica send their MSG_OSD_REPOPREPLY message. This does help ceph
to obtain the strong consistence all the time.

I have made some small changes to this golden rule in Ceph in order to
reduce the latency during random write as much as possible.

In the test patch (please see below), we assume pool size is 3,  the
'success' will be returned to client once PrimaryPG has received any
two of the messages, in this case everything works well, including
recovery and backfill etc. In the meanwhile, the Latency of 4K-random
write in my HDD cluster is almost reduced by 50%, which is cut down
from 20ms to 10ms.

However, with this patch, balance read must be disabled since stale
data could be read from replica PG. And the reliability of the data
will be compromised. Even so, I think the effect is limited since
these missing data can be recovered by using PGLOG.

Besides, this modification also made me a little bit anxious. The
reason is that some bugs have been detected and fixed during testing.
Considering this, I am not sure if there will be any other bugs
related to this modification. Therefore, I really hope to get your
suggestions in order to avoid the unexpected bugs in advance.

It will be highly appreciated if any of you would like to share your
experience and comment.
Big thanks in advance

This patch is base on 14.2.7

diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h
index 79d9c1f..464fba1 100644
--- a/src/common/legacy_config_opts.h
+++ b/src/common/legacy_config_opts.h
@@ -873,6 +873,8 @@ OPTION(osd_requested_scrub_priority, OPT_U32)
 OPTION(osd_pg_delete_priority, OPT_U32)
 OPTION(osd_pg_delete_cost, OPT_U32) // set default cost equal to 1MB io

+OPTION(osd_pg_allow_majority_write, OPT_BOOL) // set default cost
equal to 1MB io
+
 OPTION(osd_recovery_priority, OPT_U32)
 // set default cost equal to 20MB io
 OPTION(osd_recovery_cost, OPT_U32)
diff --git a/src/common/options.cc b/src/common/options.cc
index f0ba13a..26242eb 100644
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -4082,6 +4082,10 @@ std::vector<Option> get_global_options() {
     .set_default(1<<20)
     .set_description(""),

+    Option("osd_pg_allow_majority_write", Option::TYPE_BOOL,
Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
     Option("osd_scrub_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(5)
     .set_description("Priority for scrub operations in work queue"),
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index cd52976..ad293ec 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -3936,12 +3936,23 @@ void PG::write_if_dirty(ObjectStore::Transaction& t)

 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
 {
+  // test if this assertion can pass for ever!
+  dout(10) << "add_log_entry e.version" << e.version << dendl;
+  dout(10) << "add_log_entry e.user_version" << e.user_version << dendl;
+  dout(10) << "add_log_entry info.last_user_version" <<
info.last_user_version << dendl;
+  dout(10) << "add_log_entry info.last_complete" <<
info.last_complete << dendl;
+  dout(10) << "add_log_entry info.last_update" << info.last_update << dendl;
+
   // raise last_complete only if we were previously up to date
   if (info.last_complete == info.last_update)
+  {
+    ceph_assert(e.version.version == info.last_complete.version + 1);
     info.last_complete = e.version;
-
+  }
+
   // raise last_update.
   ceph_assert(e.version > info.last_update);
+  ceph_assert(e.version.version == info.last_update.version + 1);
   info.last_update = e.version;

   // raise user_version, if it increased (it may have not get bumped
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index f8d67af..1aca6e8 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -509,9 +509,11 @@ void ReplicatedBackend::submit_transaction(
 void ReplicatedBackend::op_commit(
   InProgressOpRef& op)
 {
-  if (op->on_commit == nullptr) {
-    // aborted
-    return;
+  if (!cct->_conf->osd_pg_allow_majority_write) {
+    if (op->on_commit == nullptr) {
+      // aborted
+      return;
+    }
   }

   FUNCTRACE(cct);
@@ -523,17 +525,26 @@ void ReplicatedBackend::op_commit(
   }

   op->waiting_for_commit.erase(get_parent()->whoami_shard());
-
-  if (op->waiting_for_commit.empty()) {
-    op->on_commit->complete(0);
-    op->on_commit = 0;
-    in_progress_ops.erase(op->tid);
+  if (cct->_conf->osd_pg_allow_majority_write) {
+    if (op->waiting_for_commit.size() == 1 && op->on_commit) {
+      op->on_commit->complete(0);
+      op->on_commit = 0;
+    }
+    if (op->waiting_for_commit.empty()) {
+      in_progress_ops.erase(op->tid);
+    }
+  } else {
+    if (op->waiting_for_commit.empty()) {
+      op->on_commit->complete(0);
+      op->on_commit = 0;
+      in_progress_ops.erase(op->tid);
+    }
   }
 }

 void ReplicatedBackend::do_repop_reply(OpRequestRef op)
 {
-  static_cast<MOSDRepOpReply*>(op->get_nonconst_req())->finish_decode();
+  static_cast<MOSDRepOpReply *>(op->get_nonconst_req())->finish_decode();
   const MOSDRepOpReply *r = static_cast<const MOSDRepOpReply *>(op->get_req());
   ceph_assert(r->get_header().type == MSG_OSD_REPOPREPLY);

@@ -552,14 +563,14 @@ void ReplicatedBackend::do_repop_reply(OpRequestRef op)

     if (m)
       dout(7) << __func__ << ": tid " << ip_op.tid << " op " //<< *m
-       << " ack_type " << (int)r->ack_type
-       << " from " << from
-       << dendl;
+              << " ack_type " << (int)r->ack_type
+              << " from " << from
+              << dendl;
     else
       dout(7) << __func__ << ": tid " << ip_op.tid << " (no op) "
-       << " ack_type " << (int)r->ack_type
-       << " from " << from
-       << dendl;
+              << " ack_type " << (int)r->ack_type
+              << " from " << from
+              << dendl;

     // oh, good.

@@ -567,8 +578,8 @@ void ReplicatedBackend::do_repop_reply(OpRequestRef op)
       ceph_assert(ip_op.waiting_for_commit.count(from));
       ip_op.waiting_for_commit.erase(from);
       if (ip_op.op) {
- ip_op.op->mark_event("sub_op_commit_rec");
- ip_op.op->pg_trace.event("sub_op_commit_rec");
+        ip_op.op->mark_event("sub_op_commit_rec");
+        ip_op.op->pg_trace.event("sub_op_commit_rec");
       }
     } else {
       // legacy peer; ignore
@@ -578,12 +589,28 @@ void ReplicatedBackend::do_repop_reply(OpRequestRef op)
       from,
       r->get_last_complete_ondisk());

-    if (ip_op.waiting_for_commit.empty() &&
+    if (cct->_conf->osd_pg_allow_majority_write) {
+      if (ip_op.waiting_for_commit.size() == 1 &&
         ip_op.on_commit) {
-      ip_op.on_commit->complete(0);
-      ip_op.on_commit = 0;
-      in_progress_ops.erase(iter);
+        ip_op.on_commit->complete(0);
+        ip_op.on_commit = 0;
+      }
+      if (ip_op.waiting_for_commit.empty()) {
+        in_progress_ops.erase(iter);
+      }
+    } else {
+      if (ip_op.waiting_for_commit.empty() && ip_op.on_commit) {
+        ip_op.on_commit->complete(0);
+        ip_op.on_commit = 0;
+        in_progress_ops.erase(iter);
+      }
     }
+
+    dout(10) << __func__ << ": tid " << ip_op.tid << " op " //<< *m
+              << " ack_type " << (int)r->ack_type
+              << " from " << from
+              << " in_progress_ops size " << in_progress_ops.size()
+              << dendl;
   }
 }