If a CRUSH rule change wants to move shards of EC pools, and any file
fails to read due to a bad block, the OSD will crash, and to avoid
further unexpected failures it must be kept down until other OSDs
recover the PG.

With ceph_ectool, you can recompute the contents of the broken file
out of the object proper, if the PG is active, or out of files
extracted from other active shards, and replace the broken file, so
that the OSD won't crash any more.

I'm aware of the jerasure example program that purports to do the
same, but the erasure codes generated by it are not compatible with
those generated by Ceph, at least not for the cauchy_good erasure
codes in my 4+4 setting.

With ceph_ectool, I have managed to recover some severely degraded
objects and recompute data and erasure codes of various files.

Signed-off-by: Alexandre Oliva <oliva@xxxxxxx>
---
 src/tools/Makefile.am    |   6 +
 src/tools/ceph_ectool.cc | 247 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 253 insertions(+)
 create mode 100644 src/tools/ceph_ectool.cc

diff --git a/src/tools/Makefile.am b/src/tools/Makefile.am
index 1a73995..e4ea0d0 100644
--- a/src/tools/Makefile.am
+++ b/src/tools/Makefile.am
@@ -102,6 +102,12 @@ ceph_mon_store_converter_SOURCES = tools/mon_store_converter.cc
 ceph_mon_store_converter_LDADD = $(LIBMON) $(LIBOS) $(CEPH_GLOBAL)
 bin_PROGRAMS += ceph_mon_store_converter
 
+ceph_ectool_SOURCES = tools/ceph_ectool.cc
+ceph_ectool_CXXFLAGS = ${AM_CXXFLAGS} \
+	-I$(top_srcdir)/src/erasure-code -I$(top_srcdir)/src/osd
+ceph_ectool_LDADD = $(LIBOS) $(LIBOSD_TYPES) $(CEPH_GLOBAL)
+bin_PROGRAMS += ceph_ectool
+
 noinst_HEADERS += \
 	tools/cephfs/JournalTool.h \
 	tools/cephfs/JournalScanner.h \
diff --git a/src/tools/ceph_ectool.cc b/src/tools/ceph_ectool.cc
new file mode 100644
index 0000000..04b3894
--- /dev/null
+++ b/src/tools/ceph_ectool.cc
@@ -0,0 +1,247 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Alexandre Oliva <oliva@xxxxxxx>
+ *
+ * This is free software; 
you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "erasure-code/ErasureCodePlugin.h"
+#include "osd/ECUtil.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "common/config.h"
+#include <errno.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+namespace {
+
+/*
+ * Owns a file descriptor and closes it on scope exit, so that every
+ * early return below releases the descriptor it opened.
+ */
+struct fd_guard {
+  int fd;
+  explicit fd_guard(int f) : fd(f) {}
+  ~fd_guard() { if (fd >= 0) ::close(fd); }
+private:
+  fd_guard(const fd_guard &);            // non-copyable
+  fd_guard &operator=(const fd_guard &); // non-assignable
+};
+
+/*
+ * A command-line filename counts as "named" unless it is missing,
+ * empty, or the literal "--" (see the usage text in main).
+ */
+bool is_named(const char *fname)
+{
+  return fname && *fname && strcmp(fname, "--") != 0;
+}
+
+/*
+ * Store the size of the file behind fd in *len.  Returns false if
+ * fstat fails or the size does not fit in an int, which is the type
+ * used for all size arithmetic in this tool.
+ */
+bool file_length(int fd, int *len)
+{
+  struct stat st;
+  memset(&st, 0, sizeof(st));
+  if (::fstat(fd, &st) != 0 || st.st_size < 0 || st.st_size > INT_MAX)
+    return false;
+  *len = st.st_size;
+  return true;
+}
+
+/*
+ * Read exactly len bytes from fd into buf, retrying after short reads
+ * and EINTR.  Returns false on a read error or premature end of file.
+ */
+bool read_fully(int fd, char *buf, size_t len)
+{
+  size_t done = 0;
+  while (done < len) {
+    ssize_t r = ::read(fd, buf + done, len - done);
+    if (r < 0) {
+      if (errno == EINTR)
+        continue;
+      return false;
+    }
+    if (r == 0)
+      return false;
+    done += r;
+  }
+  return true;
+}
+
+/*
+ * Create (or truncate) path with mode 0600 and write bl to it.
+ * Returns true only if creating, writing and closing all succeeded.
+ */
+bool write_new_file(const char *path, bufferlist &bl)
+{
+  int fd = ::creat(path, 0600);
+  if (fd < 0)
+    return false;
+  int r = bl.write_fd(fd);
+  if (::close(fd) != 0)
+    return false;
+  return r == 0;
+}
+
+/*
+ * Parse the file pname, holding newline-separated key=value lines,
+ * into *profile.  Diagnostics go to cerr.  Returns false if the file
+ * cannot be read, a line lacks '=', or no "plugin" key is defined.
+ */
+bool load_profile(const char *pname, map<string,string> *profile)
+{
+  fd_guard in(::open(pname, O_RDONLY));
+  if (in.fd < 0) {
+    cerr << pname << " not found" << endl;
+    return false;
+  }
+
+  int len = 0;
+  if (!file_length(in.fd, &len)) {
+    cerr << pname << ": failed to determine file size" << endl;
+    return false;
+  }
+
+  string buf(len, '\0');
+  if (len > 0 && !read_fully(in.fd, &buf[0], len)) {
+    cerr << pname << ": failed to read " << len << " bytes" << endl;
+    return false;
+  }
+
+  string::size_type pos = 0;
+  while (pos < buf.size()) {
+    string::size_type eol = buf.find('\n', pos);
+    if (eol == string::npos)
+      eol = buf.size();
+    // The key ends at the first '=' of the line; later '=' characters
+    // belong to the value.
+    string::size_type eq = buf.find('=', pos);
+    if (eq == string::npos || eq >= eol) {
+      cerr << pname << " does not look like a profile at line "
+           << buf.substr(pos, eol - pos) << endl;
+      return false;
+    }
+    (*profile)[buf.substr(pos, eq - pos)] = buf.substr(eq + 1, eol - eq - 1);
+    pos = eol + 1;
+  }
+
+  if (!profile->count("plugin")) {
+    cerr << pname << " does not define plugin=<name>" << endl;
+    return false;
+  }
+  return true;
+}
+
+} // anonymous namespace
+
+/*
+ * ceph_ectool profile stripe object shard0 shard1...
+ *
+ * Loads the erasure code plugin named by the profile file, reads the
+ * object and shard files that exist, and then either
+ *  - decodes the object and/or the named-but-absent shards from the
+ *    shards that were read, when the object file was not read, or
+ *  - encodes the object, writes the named-but-absent shards and
+ *    compares the shards that were read against the encoded data.
+ *
+ * Returns 0 on success (and after printing usage), 1 on any error.
+ */
+int main(int argc, char *argv[]) {
+  if (argc < 3)
+    {
+      cout << "usage: ceph_ectool profile stripe object shard0 shard1..." << endl
+	   << "profile names a file with the output of the command" << endl
+	   << "  ceph osd erasure-code-profile get <profile>" << endl
+	   << "stripe is the stripe width of the pool" << endl
+	   << "object is the full object filename" << endl
+	   << "shard# is the filename of the given shard" << endl
+	   << "files named -- or an empty string are assumed absent" << endl
+	   << "files named but absent are created with decoded data" << endl
+	   << "given object and shards are verified against each other" << endl;
+      return 0;
+    }
+
+  vector<const char *> ceph_options, def_args;
+  global_init(&def_args, ceph_options, CEPH_ENTITY_TYPE_OSD,
+	      CODE_ENVIRONMENT_UTILITY_NODOUT, 0);
+  common_init_finish(g_ceph_context);
+  g_conf = g_ceph_context->_conf;
+  g_conf->set_val_or_die("log_to_stderr", "true");
+  g_conf->set_val_or_die("err_to_stderr", "true");
+  g_conf->apply_changes(NULL);
+
+  map<string,string> profile;
+  if (!load_profile(argv[1], &profile))
+    return 1;
+
+  ErasureCodeInterfaceRef ec_impl;
+  {
+    stringstream ss;
+    ceph::ErasureCodePluginRegistry::instance().
+      factory(profile.find("plugin")->second,
+	      profile, &ec_impl, ss);
+    if (!ec_impl) {
+      // Report the factory's own diagnostics instead of aborting.
+      cerr << "failed to load erasure code plugin "
+	   << profile.find("plugin")->second << ": " << ss.str() << endl;
+      return 1;
+    }
+  }
+
+  const int k = ec_impl->get_data_chunk_count();
+  const int m = ec_impl->get_coding_chunk_count();
+  const int nfiles = 1 + k + m;	// the object followed by k + m shards
+  int stripe_width = atoi (argv[2]);
+
+  if (stripe_width <= 0 || stripe_width % k) {
+    cerr << "stripe must be a positive multiple of " << k << endl;
+    return 1;
+  }
+
+  if (argc != 3 + nfiles) {
+    cerr << "wrong number of filenames given: got " << argc - 3
+	 << ", expected " << nfiles << endl;
+    return 1;
+  }
+
+  // Index 0 is the object; index i >= 1 is shard i - 1.
+  char **name = argv + 3;
+  vector<bool> named(nfiles, false);	// a filename was given
+  vector<bool> present(nfiles, false);	// the file existed and was opened
+  vector<bufferlist> data(nfiles);
+  int bufsize = 0;	// expected size of each shard file
+  int objsize = 0;	// expected size of the object file
+  int size_file = -1;	// index of the file the sizes were derived from
+  for (int i = 0; i < nfiles; i++) {
+    named[i] = is_named(name[i]);
+    if (!named[i])
+      continue;
+
+    fd_guard in(::open(name[i], O_RDONLY));
+    if (in.fd < 0)
+      continue;	// named but absent: to be created later
+    present[i] = true;
+
+    int len = 0;
+    if (!file_length(in.fd, &len)) {
+      cerr << name[i] << ": failed to determine file size" << endl;
+      return 1;
+    }
+    if (i && len > INT_MAX / k) {
+      cerr << name[i] << ": file too large" << endl;
+      return 1;
+    }
+
+    int xobjsize = len * (i ? k : 1);
+    if (size_file == -1) {
+      // The first file read determines the sizes of all the others.
+      if (xobjsize % stripe_width) {
+	if (!i)
+	  cerr << "object must be padded to a multiple of "
+	       << stripe_width << " bytes" << endl;
+	else
+	  cerr << "shard must be padded to be a multiple of "
+	       << stripe_width / k << " bytes" << endl;
+	return 1;
+      }
+
+      bufsize = xobjsize / k;
+      objsize = xobjsize;
+      size_file = i;
+    } else if ((i ? bufsize : objsize) != len) {
+      cerr << name[i] << " has " << len << " bytes, but based on "
+	   << name[size_file] << " we expected "
+	   << (i ? bufsize : objsize) << " bytes" << endl;
+      return 1;
+    }
+
+    bufferptr bp(len);
+    if (!read_fully(in.fd, bp.c_str(), len)) {
+      cerr << name[i] << ": failed to read " << len << " bytes" << endl;
+      return 1;
+    }
+
+    data[i].push_back(bp);
+  }
+
+  if (size_file == -1) {
+    cerr << "no given file" << endl;
+    return 1;
+  }
+
+  ECUtil::stripe_info_t sinfo(ec_impl->get_data_chunk_count(), stripe_width);
+
+  if (!present[0]) {
+    // No object file was read: decode from the shards that were.
+    map<int, bufferlist> to_decode;
+    map<int, bufferlist*> out;
+    for (int i = 1; i < nfiles; i++) {
+      if (!named[i])
+	continue;
+
+      const int shard = i - 1;
+      if (present[i])
+	to_decode[shard] = data[i];
+      else
+	out[shard] = &data[i];	// data is never resized, so this stays valid
+    }
+
+    if (named[0]) {
+      if (ECUtil::decode(sinfo, ec_impl, to_decode, &data[0]) != 0) {
+	cerr << "reconstruction of object failed" << endl;
+	return 1;
+      }
+
+      if (!write_new_file(name[0], data[0])) {
+	cerr << "failed to write to " << name[0] << endl;
+	return 1;
+      }
+    }
+
+    if (ECUtil::decode(sinfo, ec_impl, to_decode, out) != 0) {
+      cerr << "reconstruction of missing shards failed" << endl;
+      return 1;
+    }
+
+    for (int i = 1; i < nfiles; i++) {
+      if (!named[i] || present[i])
+	continue;
+
+      if (!write_new_file(name[i], data[i])) {
+	cerr << "failed to write to " << name[i] << endl;
+	return 1;
+      }
+    }
+  } else {
+    // The object file was read: encode it, then create or verify shards.
+    set<int> want;
+    map<int, bufferlist> out;
+
+    for (int i = 1; i < nfiles; i++) {
+      if (named[i])
+	want.insert(i - 1);
+    }
+
+    if (ECUtil::encode(sinfo, ec_impl, data[0], want, &out) != 0) {
+      cerr << "encoding failed" << endl;
+      return 1;
+    }
+
+    for (int i = 1; i < nfiles; i++) {
+      if (!named[i])
+	continue;
+
+      const int shard = i - 1;
+      if (!present[i]) {
+	if (!write_new_file(name[i], out[shard])) {
+	  cerr << "failed to write to " << name[i] << endl;
+	  return 1;
+	}
+      } else if (!(data[i] == out[shard])) {
+	cerr << name[i] << " deviates from the data encoded from "
+	     << name[0] << endl;
+	return 1;
+      }
+    }
+  }
+
+  return 0;
+}
-- 
Alexandre Oliva, freedom fighter http://FSFLA.org/~lxoliva/
You must be the change you wish to see in the world. -- Gandhi
Be Free! -- http://FSFLA.org/ FSF Latin America board member
Free Software Evangelist|Red Hat Brasil GNU Toolchain Engineer
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html