Introduce ceph_ectool

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



If a CRUSH rule change needs to move shards of EC pools, and any file
fails to read due to a bad block, the OSD will crash; to avoid further
unexpected failures, it must then be kept down until other OSDs
recover the PG.

With ceph_ectool, you can recompute the contents of the broken file out
of the object proper, if the PG is active, or out of files extracted
from other active shards, and replace the broken file, so that the OSD
won't crash any more.

I'm aware of the jerasure example program that purports to do the same,
but the erasures generated by it are not compatible with those generated
by ceph, at least not for the cauchy_good erasures in my 4+4 setting.
With ceph_ectool, I have managed to recover some severely degraded
objects and recompute data and erasure codes of various files.

Signed-off-by: Alexandre Oliva <oliva@xxxxxxx>
---
 src/tools/Makefile.am    |    6 +
 src/tools/ceph_ectool.cc |  247 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 253 insertions(+)
 create mode 100644 src/tools/ceph_ectool.cc

diff --git a/src/tools/Makefile.am b/src/tools/Makefile.am
index 1a73995..e4ea0d0 100644
--- a/src/tools/Makefile.am
+++ b/src/tools/Makefile.am
@@ -102,6 +102,12 @@ ceph_mon_store_converter_SOURCES = tools/mon_store_converter.cc
 ceph_mon_store_converter_LDADD = $(LIBMON) $(LIBOS) $(CEPH_GLOBAL)
 bin_PROGRAMS += ceph_mon_store_converter
 
+ceph_ectool_SOURCES = tools/ceph_ectool.cc
+ceph_ectool_CXXFLAGS = ${AM_CXXFLAGS} \
+	-I$(top_srcdir)/src/erasure-code -I$(top_srcdir)/src/osd
+ceph_ectool_LDADD = $(LIBOS) $(LIBOSD_TYPES) $(CEPH_GLOBAL)
+bin_PROGRAMS += ceph_ectool
+
 noinst_HEADERS += \
 	tools/cephfs/JournalTool.h \
 	tools/cephfs/JournalScanner.h \
diff --git a/src/tools/ceph_ectool.cc b/src/tools/ceph_ectool.cc
new file mode 100644
index 0000000..04b3894
--- /dev/null
+++ b/src/tools/ceph_ectool.cc
@@ -0,0 +1,247 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Alexandre Oliva <oliva@xxxxxxx>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "erasure-code/ErasureCodePlugin.h"
+#include "osd/ECUtil.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "common/config.h"
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+// Encode, verify or rebuild the on-disk files of one erasure-coded object
+// (argv[3]) and its k+m shards; returns 1 after any reported error.
+int main(int argc, char *argv[]) {
+  if (argc < 3)
+    {
+      cout << "usage: ceph_ectool profile stripe object shard0 shard1..." << endl
+	   << "profile names a file with the output of the command" << endl
+	   << "  ceph osd erasure-code-profile get <profile>" << endl
+	   << "stripe is the stripe width of the pool" << endl
+	   << "object is the full object filename" << endl
+	   << "shard# is the filename of the given shard" << endl
+	   << "files named -- or an empty string are assumed absent" << endl
+	   << "files named but absent are created with decoded data" << endl
+	   << "given object and shards are verified against each other" << endl;
+      return 0;
+    }
+  // Initialize the global Ceph context as a utility and send log output to stderr.
+  vector<const char *> ceph_options, def_args;
+  global_init(&def_args, ceph_options, CEPH_ENTITY_TYPE_OSD,
+	      CODE_ENVIRONMENT_UTILITY_NODOUT, 0);
+  common_init_finish(g_ceph_context);
+  g_conf = g_ceph_context->_conf;
+  g_conf->set_val_or_die("log_to_stderr", "true");
+  g_conf->set_val_or_die("err_to_stderr", "true");
+  g_conf->apply_changes(NULL);
+  // Parse the profile file (argv[1]): one key=value pair per line.
+  map<string,string> profile;
+  {
+    char *pname = argv[1];
+    int fd = open(pname, O_RDONLY);
+    if (fd == -1) {
+      cerr << pname << " not found" << endl;
+      return 1;
+    }
+    struct stat st;
+    memset(&st, 0, sizeof(struct stat));
+    int r = fstat(fd, &st);
+    assert(r == 0);
+    int len = st.st_size;
+    char buf[len];
+    ssize_t got = read(fd, buf, len);
+    close(fd); // the descriptor is not needed past this point, even on a short read
+    if (got != len) {
+      cerr << pname << ": failed to read " << len << " bytes" << endl;
+      return 1;
+    }
+    for (int i = 0; i < len; i++) {
+      string first;
+      while (i < len && buf[i] != '=' && buf[i] != '\n')
+	first += buf[i++];
+      if (i == len || buf[i] == '\n') {
+	cerr << pname << " does not look like a profile at line " << first << endl;
+	return 1;
+      }
+      i++; // step over the '=' that ended the key
+      string second;
+      while (i < len && buf[i] != '\n')
+	second += buf[i++];
+      profile[first] = second; // the for-increment steps over the newline
+    }
+    if (!profile.count("plugin")) {
+      cerr << pname << " does not define plugin=<name>" << endl;
+      return 1;
+    }
+  }
+  ErasureCodeInterfaceRef ec_impl;
+  {
+    stringstream ss; // handed to factory(); presumably receives its diagnostics
+    ceph::ErasureCodePluginRegistry::instance().
+      factory(profile.find("plugin")->second,
+	      profile, &ec_impl, ss);
+    if (!ec_impl) {
+      cerr << "failed to instantiate erasure code plugin: " << ss.str() << endl;
+      return 1;
+    }
+  }
+  // k data chunks plus m coding chunks make up the shards of one object.
+  int k = ec_impl->get_data_chunk_count();
+  int m = ec_impl->get_coding_chunk_count();
+  int stripe_width = atoi (argv[2]);
+
+  if (stripe_width <= 0 || stripe_width % k) {
+    cerr << "stripe must be a positive multiple of " << k << endl;
+    return 1;
+  }
+
+  if (argc != 1 + 2 + 1 + k + m) {
+    cerr << "wrong number of filenames given: got " << argc - 3
+	 << ", expected " << 1 + k + m << endl;
+    return 1;
+  }
+  // Slot 0 is the whole object; slots 1..k+m are the shards, in argument order.
+  char **name = argv + 3;
+  bool named[1+k+m];   // a usable filename was given for this slot
+  bool present[1+k+m]; // ...and that file could be opened and read
+  bufferlist data[1+k+m];
+  int bufsize = 0;     // expected size of each shard file
+  int objsize = 0;     // expected size of the (padded) object file
+  int size_file = -1;  // slot of the first file read, which fixes the sizes
+  for (int i = 0; i <= k + m; i++) {
+    named[i] = present[i] = name[i] && *name[i] && strcmp (name[i], "--"); // "" and "--" both mean absent
+    if (named[i]) {
+      int fd = ::open(name[i], O_RDONLY);
+      if (fd < 0)
+	present[i] = false; // named but unreadable: to be created further down
+      else {
+	struct stat st;
+	memset(&st, 0, sizeof(struct stat));
+	int r = fstat(fd, &st);
+	assert(r == 0);
+	int len = st.st_size;
+	int xobjsize = len * (i ? k : 1); // object size implied by this file
+	if (size_file == -1) {
+	  if (xobjsize % stripe_width) {
+	    if (!i)
+	      cerr << "object must be padded to a multiple of "
+		   << stripe_width << " bytes" << endl;
+	    else
+	      cerr << "shard must be padded to be a multiple of "
+		   << stripe_width / k << " bytes" << endl;
+	    return 1;
+	  }
+	  bufsize = xobjsize / k;
+	  objsize = xobjsize;
+	  size_file = i;
+	} else if ((i ? bufsize : objsize) != len) {
+	  cerr << name[i] << " has " << len << " bytes, but based on "
+	       << name[size_file] << " we expected "
+	       << (i ? bufsize : objsize) << " bytes" << endl;
+	  return 1;
+	}
+	bufferptr bp(len);
+	if (read(fd, bp.c_str(), len) != len) {
+	  cerr << name[i] << ": failed to read " << len << " bytes" << endl;
+	  return 1;
+	}
+	close(fd);
+	data[i].push_back(bp);
+      }
+    }
+  }
+
+  if (size_file == -1) {
+    cerr << "no given file" << endl;
+    return 1;
+  }
+
+  ECUtil::stripe_info_t sinfo(ec_impl->get_data_chunk_count(), stripe_width);
+
+  if (!present[0]) {
+    // No object file to read: work from the shards that are present.
+    map<int, bufferlist> to_decode;
+    map<int, bufferlist*> out;
+    for (int i = 1, s = 0; i <= k + m; i++, s++) {
+      if (!named[i])
+	continue;
+
+      if (present[i])
+	to_decode[s] = data[i];
+      else
+	out[s] = &data[i];
+    }
+
+    if (named[0]) { // present[0] is already known to be false in this branch
+      if (ECUtil::decode(sinfo, ec_impl, to_decode, &data[0]) != 0) {
+	cerr << "reconstruction of object failed" << endl;
+	return 1;
+      }
+
+      int fd = creat(name[0], 0600);
+      if (fd < 0 || data[0].write_fd(fd) != 0 || close(fd) != 0) {
+	cerr << "failed to write to " << name[0] << endl;
+	return 1;
+      }
+    }
+
+    if (ECUtil::decode(sinfo, ec_impl, to_decode, out) != 0) {
+      cerr << "reconstruction of missing shards failed" << endl;
+      return 1;
+    }
+
+    for (int i = 1; i <= k + m; i++) {
+      if (!named[i])
+	continue;
+
+      if (!present[i]) {
+	int fd = creat(name[i], 0600);
+	if (fd < 0 || data[i].write_fd(fd) != 0 || close(fd) != 0) {
+	  cerr << "failed to write to " << name[i] << endl;
+	  return 1;
+	}
+      }
+    }
+  } else {
+    // The object file was read: encode it, then create or verify each named shard.
+    set<int> want;
+    map<int, bufferlist> out;
+
+    for (int i = 1, s = 0; i <= k + m; i++, s++) {
+      if (!named[i])
+	continue;
+
+      want.insert(s);
+    }
+
+    if (ECUtil::encode(sinfo, ec_impl, data[0], want, &out) != 0) {
+      cerr << "encoding failed" << endl;
+      return 1;
+    }
+
+    for (int i = 1, s = 0; i <= k + m; i++, s++) {
+      if (!named[i])
+	continue;
+
+      if (!present[i]) {
+	int fd = creat(name[i], 0600);
+	if (fd < 0 || out[s].write_fd(fd) != 0 || close(fd) != 0) {
+	  cerr << "failed to write to " << name[i] << endl;
+	  return 1;
+	}
+      } else if (!(data[i] == out[s])) {
+	cerr << name[i] << " deviates from the data encoded from "
+	     << name[0] << endl;
+	return 1;
+      }
+    }
+  }
+}

-- 
Alexandre Oliva, freedom fighter    http://FSFLA.org/~lxoliva/
You must be the change you wish to see in the world. -- Gandhi
Be Free! -- http://FSFLA.org/   FSF Latin America board member
Free Software Evangelist|Red Hat Brasil GNU Toolchain Engineer
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux