Re: Correct procedure for removing ceph nodes

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Took a shot at implementing the remove mon command, turned out to be
not _quite_ as trivial as I'd expected.

The monitors need to adjust their whoami values after getting the new
map, which is annoying since the value is stored in quite a few
places. I ended up removing all the direct copies I could find and had
them look up the value in Monitor instead.
The interaction with MonClient (i.e. via the ceph command) needs a bit
more work, but I'm not really sure how to proceed at this point. If
the client happens to pick the monitor that is to be removed to send
the remove command to, it'll retry with another monitor and get a
"does not exist" error (because the first operation already removed
it). If the client happens to pick a monitor whose id is greater than
the one to be removed, it will never get a response to the command.

>From 1d30deea10266392328c6fbdf176b766db42c32f Mon Sep 17 00:00:00 2001
From: Paul Chiang <paul_chiang@xxxxxxxxxxxxxxxxxxx>
Date: Mon, 7 Jun 2010 09:47:48 +0800
Subject: [PATCH 1/2] Removed all copies of the whoami value


Signed-off-by: Paul Chiang <paul_chiang@xxxxxxxxxxxxxxxxxxx>
---
 src/mon/Elector.cc |   10 +++++-----
 src/mon/Elector.h  |    3 +--
 src/mon/Monitor.cc |    4 ++--
 src/mon/Paxos.cc   |   16 ++++++++--------
 src/mon/Paxos.h    |    5 ++---
 5 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc
index 7f8fa62..41fde6e 100644
--- a/src/mon/Elector.cc
+++ b/src/mon/Elector.cc
@@ -69,11 +69,11 @@ void Elector::start()
     bump_epoch(epoch+1);  // odd == election cycle
   start_stamp = g_clock.now();
   electing_me = true;
-  acked_me.insert(whoami);
+  acked_me.insert(mon->whoami);

   // bcast to everyone else
   for (unsigned i=0; i<mon->monmap->size(); ++i) {
-    if ((int)i == whoami) continue;
+    if ((int)i == mon->whoami) continue;
     mon->messenger->send_message(new
MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap),
 				 mon->monmap->get_inst(i));
   }
@@ -151,7 +151,7 @@ void Elector::victory()
   for (set<int>::iterator p = quorum.begin();
        p != quorum.end();
        ++p) {
-    if (*p == whoami) continue;
+    if (*p == mon->whoami) continue;
     MMonElection *m = new MMonElection(MMonElection::OP_VICTORY,
epoch, mon->monmap);
     m->quorum = quorum;
     mon->messenger->send_message(m, mon->monmap->get_inst(*p));
@@ -186,7 +186,7 @@ void Elector::handle_propose(MMonElection *m)
     }
   }

-  if (whoami < from) {
+  if (mon->whoami < from) {
     // i would win over them.
     if (leader_acked >= 0) {        // we already acked someone
       assert(leader_acked < from);  // and they still win, of course
@@ -250,7 +250,7 @@ void Elector::handle_victory(MMonElection *m)
   dout(5) << "handle_victory from " << m->get_source() << dendl;
   int from = m->get_source().num();

-  assert(from < whoami);
+  assert(from < mon->whoami);
   assert(m->epoch % 2 == 0);

   // i should have seen this election if i'm getting the victory.
diff --git a/src/mon/Elector.h b/src/mon/Elector.h
index 9bfd7cb..8a21e09 100644
--- a/src/mon/Elector.h
+++ b/src/mon/Elector.h
@@ -32,7 +32,6 @@ class Monitor;
 class Elector {
  private:
   Monitor *mon;
-  int whoami;

   Context *expire_event;

@@ -71,7 +70,7 @@ class Elector {
   void handle_victory(class MMonElection *m);

  public:
-  Elector(Monitor *m, int w) : mon(m), whoami(w),
+  Elector(Monitor *m) : mon(m),
 			       expire_event(0),
 			       epoch(0),
 			       electing_me(false),
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index cf0b20b..db086be 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -93,7 +93,7 @@ Monitor::Monitor(int w, MonitorStore *s, Messenger
*m, MonMap *map) :

   state(STATE_STARTING), stopping(false),

-  elector(this, w),
+  elector(this),
   mon_epoch(0),
   leader(0),
   paxos(PAXOS_NUM), paxos_service(PAXOS_NUM),
@@ -114,7 +114,7 @@ Monitor::Monitor(int w, MonitorStore *s, Messenger
*m, MonMap *map) :

 Paxos *Monitor::add_paxos(int type)
 {
-  Paxos *p = new Paxos(this, whoami, type);
+  Paxos *p = new Paxos(this, type);
   paxos[type] = p;
   return p;
 }
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index d10f31c..9c247e8 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -23,7 +23,7 @@

 #define DOUT_SUBSYS paxos
 #undef dout_prefix
-#define dout_prefix _prefix(mon, whoami, machine_name, state, last_committed)
+#define dout_prefix _prefix(mon, mon->whoami, machine_name, state,
last_committed)
 static ostream& _prefix(Monitor *mon, int whoami, const char
*machine_name, int state, version_t last_committed) {
   return *_dout << dbeginl
 		<< "mon" << whoami
@@ -87,7 +87,7 @@ void Paxos::collect(version_t oldpn)
   for (set<int>::const_iterator p = mon->get_quorum().begin();
        p != mon->get_quorum().end();
        ++p) {
-    if (*p == whoami) continue;
+    if (*p == mon->whoami) continue;

     MMonPaxos *collect = new MMonPaxos(mon->get_epoch(),
MMonPaxos::OP_COLLECT, machine_id);
     collect->last_committed = last_committed;
@@ -336,7 +336,7 @@ void Paxos::begin(bufferlist& v)

   // accept it ourselves
   accepted.clear();
-  accepted.insert(whoami);
+  accepted.insert(mon->whoami);
   new_value = v;
   mon->store->put_bl_sn(new_value, machine_name, last_committed+1);

@@ -356,7 +356,7 @@ void Paxos::begin(bufferlist& v)
   for (set<int>::const_iterator p = mon->get_quorum().begin();
        p != mon->get_quorum().end();
        ++p) {
-    if (*p == whoami) continue;
+    if (*p == mon->whoami) continue;

     dout(10) << " sending begin to mon" << *p << dendl;
     MMonPaxos *begin = new MMonPaxos(mon->get_epoch(),
MMonPaxos::OP_BEGIN, machine_id);
@@ -483,7 +483,7 @@ void Paxos::commit()
   for (set<int>::const_iterator p = mon->get_quorum().begin();
        p != mon->get_quorum().end();
        ++p) {
-    if (*p == whoami) continue;
+    if (*p == mon->whoami) continue;

     dout(10) << " sending commit to mon" << *p << dendl;
     MMonPaxos *commit = new MMonPaxos(mon->get_epoch(),
MMonPaxos::OP_COMMIT, machine_id);
@@ -527,7 +527,7 @@ void Paxos::extend_lease()
   lease_expire = g_clock.now();
   lease_expire += g_conf.mon_lease;
   acked_lease.clear();
-  acked_lease.insert(whoami);
+  acked_lease.insert(mon->whoami);

   dout(7) << "extend_lease now+" << g_conf.mon_lease << " (" <<
lease_expire << ")" << dendl;

@@ -535,7 +535,7 @@ void Paxos::extend_lease()
   for (set<int>::const_iterator p = mon->get_quorum().begin();
        p != mon->get_quorum().end();
        ++p) {
-    if (*p == whoami) continue;
+    if (*p == mon->whoami) continue;
     MMonPaxos *lease = new MMonPaxos(mon->get_epoch(),
MMonPaxos::OP_LEASE, machine_id);
     lease->last_committed = last_committed;
     lease->lease_timestamp = lease_expire;
@@ -724,7 +724,7 @@ version_t Paxos::get_new_proposal_number(version_t gt)
   last_pn /= 100;
   last_pn++;
   last_pn *= 100;
-  last_pn += (version_t)whoami;
+  last_pn += (version_t)mon->whoami;

   // write
   mon->store->put_int(last_pn, machine_name, "last_pn");
diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h
index c6eff22..9d34605 100644
--- a/src/mon/Paxos.h
+++ b/src/mon/Paxos.h
@@ -68,7 +68,6 @@ class Paxos;
 // i am one state machine.
 class Paxos {
   Monitor *mon;
-  int whoami;

   // my state machine info
   int machine_id;
@@ -225,8 +224,8 @@ private:
   version_t get_new_proposal_number(version_t gt=0);

 public:
-  Paxos(Monitor *m, int w,
-	int mid) : mon(m), whoami(w),
+  Paxos(Monitor *m,
+	int mid) : mon(m),
 		   machine_id(mid),
 		   machine_name(get_paxos_name(mid)),
 		   state(STATE_RECOVERING),
-- 
1.6.4.2

>From eb7308104c33aa50195e3736483c6bf9a145a518 Mon Sep 17 00:00:00 2001
From: Paul Chiang <paul_chiang@xxxxxxxxxxxxxxxxxxx>
Date: Mon, 7 Jun 2010 10:16:39 +0800
Subject: [PATCH 2/2] Introduced ceph mon remove command


Signed-off-by: Paul Chiang <paul_chiang@xxxxxxxxxxxxxxxxxxx>
---
 src/common/LogClient.h   |    1 +
 src/mon/Monitor.cc       |    1 +
 src/mon/Monitor.h        |    1 +
 src/mon/MonmapMonitor.cc |   61 ++++++++++++++++++++++++++++++++++++++++++++++
 src/mon/MonmapMonitor.h  |    1 +
 5 files changed, 65 insertions(+), 0 deletions(-)

diff --git a/src/common/LogClient.h b/src/common/LogClient.h
index 0185eec..bc5e227 100644
--- a/src/common/LogClient.h
+++ b/src/common/LogClient.h
@@ -56,6 +56,7 @@ class LogClient : public Dispatcher {
   void send_log();
   void handle_log_ack(MLogAck *m);
   void set_synchronous(bool sync) { is_synchronous = sync; }
+  void set_mon(int mon_id) { mon = mon_id; }

   LogClient(Messenger *m, MonMap *mm) :
     messenger(m), monmap(mm), mon(-1), is_synchronous(false),
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index db086be..259b6ce 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -110,6 +110,7 @@ Monitor::Monitor(int w, MonitorStore *s, Messenger
*m, MonMap *map) :
   mon_caps = new MonCaps();
   mon_caps->set_allow_all(true);
   mon_caps->text = "allow *";
+  myaddr = map->get_inst(w).addr;
 }

 Paxos *Monitor::add_paxos(int type)
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index 829c506..a673972 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -58,6 +58,7 @@ class Monitor : public Dispatcher {
 public:
   // me
   int whoami;
+  entity_addr_t myaddr;
   Messenger *messenger;
   Mutex lock;

diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc
index 0aca5fb..5368ab7 100644
--- a/src/mon/MonmapMonitor.cc
+++ b/src/mon/MonmapMonitor.cc
@@ -48,6 +48,7 @@ bool MonmapMonitor::update_from_paxos()
   dout(10) << "update_from_paxos paxosv " << paxosv
 	   << ", my v " << mon->monmap->epoch << dendl;

+  int original_map_size = mon->monmap->size();
   //read and decode
   monmap_bl.clear();
   bool success = paxos->read(paxosv, monmap_bl);
@@ -58,6 +59,20 @@ bool MonmapMonitor::update_from_paxos()
   //save the bufferlist version in the paxos instance as well
   paxos->stash_latest(paxosv, monmap_bl);

+  if (original_map_size != mon->monmap->size())
+  {
+    _update_whoami();
+
+    // call election?
+    if (mon->monmap->size() > 1) {
+      mon->call_election();
+    } else {
+      // we're standalone.
+      set<int> q;
+      q.insert(mon->whoami);
+      mon->win_election(1, q);
+    }
+  }
   return true;
 }

@@ -127,6 +142,8 @@ bool MonmapMonitor::preprocess_command(MMonCommand *m)
     }
     else if (m->cmd[1] == "add")
       return false;
+    else if (m->cmd[1] == "remove")
+      return false;
   }

   if (r != -1) {
@@ -177,6 +194,23 @@ bool MonmapMonitor::prepare_command(MMonCommand *m)
       paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs,
paxos->get_version()));
       return true;
     }
+    else if (m->cmd.size() == 3 && m->cmd[1] == "remove") {
+      entity_addr_t addr;
+      parse_ip_port(m->cmd[2].c_str(), addr);
+      bufferlist rdata;
+      if (!pending_map.contains(addr)) {
+        err = -ENOENT;
+        ss << "mon " << addr << " does not exist";
+        goto out;
+      }
+
+      pending_map.remove(addr);
+      pending_map.last_changed = g_clock.now();
+      ss << "removed mon" << " at " << addr << ", there are now " <<
pending_map.size() << " monitors" ;
+      getline(ss, rs);
+      paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs,
paxos->get_version()));
+      return true;
+    }
     else
       ss << "unknown command " << m->cmd[1];
   } else
@@ -203,3 +237,30 @@ void MonmapMonitor::tick()
 {
   update_from_paxos();
 }
+
+void MonmapMonitor::_update_whoami()
+{
+  // first check if there is any change
+  if (mon->whoami < mon->monmap->size() &&
+      mon->monmap->get_inst(mon->whoami).addr == mon->myaddr)
+  {
+    return;
+  }
+
+  // then check backwards starting from min(whoami-1, size-1) since
whoami only ever decreases
+  unsigned i=(mon->whoami-1) < mon->monmap->size() ?
(mon->whoami-1):(mon->monmap->size()-1);
+  for (; i>=0; i--)
+  {
+    if (mon->monmap->get_inst(i).addr == mon->myaddr)
+    {
+      dout(10) << "Changing whoami from " << mon->whoami << " to " <<
i << dendl;
+      mon->whoami = i;
+      mon->messenger->set_myname(entity_name_t::MON(i));
+      mon->logclient.set_mon(i);
+      return;
+    }
+  }
+  dout(0) << "Cannot find myself (mon" << mon->whoami << ", " <<
mon->myaddr << ") in new monmap! I must have been removed, shutting
down." << dendl;
+  mon->shutdown();
+}
+
diff --git a/src/mon/MonmapMonitor.h b/src/mon/MonmapMonitor.h
index fbb7fa5..b14c012 100644
--- a/src/mon/MonmapMonitor.h
+++ b/src/mon/MonmapMonitor.h
@@ -67,6 +67,7 @@ class MonmapMonitor : public PaxosService {

  private:
   bufferlist monmap_bl;
+  void _update_whoami();
 };


-- 
1.6.4.2
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux