The following changes since commit f2cd91604af170e972438c461a40230e266a57d9: debug: fix inverted logic in fio_did_warn() (2018-02-12 10:55:07 -0700) are available in the git repository at: git://git.kernel.dk/fio.git master for you to fetch changes up to 488468793d72297f7dccd42a7b50011e74de0688: Merge branch 'wip-ifed-rados' of https://github.com/ifed01/fio (2018-02-14 12:43:04 -0700) ---------------------------------------------------------------- Igor Fedotov (1): Add support for Ceph Rados benchmarking. Jens Axboe (1): Merge branch 'wip-ifed-rados' of https://github.com/ifed01/fio Makefile | 3 + configure | 34 ++++ engines/rados.c | 479 +++++++++++++++++++++++++++++++++++++++++++++++++++++ examples/rados.fio | 24 +++ fio.1 | 17 +- 5 files changed, 553 insertions(+), 4 deletions(-) create mode 100644 engines/rados.c create mode 100644 examples/rados.fio --- Diff of recent changes: diff --git a/Makefile b/Makefile index 3ce6064..c25b422 100644 --- a/Makefile +++ b/Makefile @@ -95,6 +95,9 @@ endif ifdef CONFIG_WINDOWSAIO SOURCE += engines/windowsaio.c endif +ifdef CONFIG_RADOS + SOURCE += engines/rados.c +endif ifdef CONFIG_RBD SOURCE += engines/rbd.c endif diff --git a/configure b/configure index d92bb0f..5d283d7 100755 --- a/configure +++ b/configure @@ -173,6 +173,8 @@ for opt do ;; --disable-rdma) disable_rdma="yes" ;; + --disable-rados) disable_rados="yes" + ;; --disable-rbd) disable_rbd="yes" ;; --disable-rbd-blkin) disable_rbd_blkin="yes" @@ -1527,6 +1529,35 @@ fi print_config "IPv6 helpers" "$ipv6" ########################################## +# check for rados +if test "$rados" != "yes" ; then + rados="no" +fi +cat > $TMPC << EOF +#include <rados/librados.h> + +int main(int argc, char **argv) +{ + rados_t cluster; + rados_ioctx_t io_ctx; + const char cluster_name[] = "ceph"; + const char user_name[] = "client.admin"; + const char pool[] = "rados"; + + /* The rados_create2 signature required was only introduced in ceph 0.65 */ + rados_create2(&cluster, 
cluster_name, user_name, 0); + rados_ioctx_create(cluster, pool, &io_ctx); + + return 0; +} +EOF +if test "$disable_rados" != "yes" && compile_prog "" "-lrados" "rados"; then + LIBS="-lrados $LIBS" + rados="yes" +fi +print_config "Rados engine" "$rados" + +########################################## # check for rbd if test "$rbd" != "yes" ; then rbd="no" @@ -2262,6 +2293,9 @@ fi if test "$ipv6" = "yes" ; then output_sym "CONFIG_IPV6" fi +if test "$rados" = "yes" ; then + output_sym "CONFIG_RADOS" +fi if test "$rbd" = "yes" ; then output_sym "CONFIG_RBD" fi diff --git a/engines/rados.c b/engines/rados.c new file mode 100644 index 0000000..dc0d7b1 --- /dev/null +++ b/engines/rados.c @@ -0,0 +1,479 @@ +/* + * Ceph Rados engine + * + * IO engine using Ceph's RADOS interface to test low-level performance of + * Ceph OSDs. + * + */ + +#include <rados/librados.h> +#include <pthread.h> +#include "fio.h" +#include "../optgroup.h" + +struct fio_rados_iou { + struct thread_data *td; + struct io_u *io_u; + rados_completion_t completion; + rados_write_op_t write_op; +}; + +struct rados_data { + rados_t cluster; + rados_ioctx_t io_ctx; + char **objects; + size_t object_count; + struct io_u **aio_events; + bool connected; +}; + +/* fio configuration options read from the job file */ +struct rados_options { + void *pad; + char *cluster_name; + char *pool_name; + char *client_name; + int busy_poll; +}; + +static struct fio_option options[] = { + { + .name = "clustername", + .lname = "ceph cluster name", + .type = FIO_OPT_STR_STORE, + .help = "Cluster name for ceph", + .off1 = offsetof(struct rados_options, cluster_name), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = "pool", + .lname = "pool name to use", + .type = FIO_OPT_STR_STORE, + .help = "Ceph pool name to benchmark against", + .off1 = offsetof(struct rados_options, pool_name), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = "clientname", + .lname = "rados engine 
clientname", + .type = FIO_OPT_STR_STORE, + .help = "Name of the ceph client to access RADOS engine", + .off1 = offsetof(struct rados_options, client_name), + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = "busy_poll", + .lname = "busy poll mode", + .type = FIO_OPT_BOOL, + .help = "Busy poll for completions instead of sleeping", + .off1 = offsetof(struct rados_options, busy_poll), + .def = "0", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = NULL, + }, +}; + +static int _fio_setup_rados_data(struct thread_data *td, + struct rados_data **rados_data_ptr) +{ + struct rados_data *rados; + + if (td->io_ops_data) + return 0; + + rados = calloc(1, sizeof(struct rados_data)); + if (!rados) + goto failed; + + rados->connected = false; + + rados->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *)); + if (!rados->aio_events) + goto failed; + + rados->object_count = td->o.nr_files; + rados->objects = calloc(rados->object_count, sizeof(char*)); + if (!rados->objects) + goto failed; + + *rados_data_ptr = rados; + return 0; + +failed: + if (rados) { + rados->object_count = 0; + if (rados->aio_events) + free(rados->aio_events); + free(rados); + } + return 1; +} + +static void _fio_rados_rm_objects(struct rados_data *rados) +{ + size_t i; + for (i = 0; i < rados->object_count; ++i) { + if (rados->objects[i]) { + rados_remove(rados->io_ctx, rados->objects[i]); + free(rados->objects[i]); + rados->objects[i] = NULL; + } + } +} + +static int _fio_rados_connect(struct thread_data *td) +{ + struct rados_data *rados = td->io_ops_data; + struct rados_options *o = td->eo; + int r; + const uint64_t file_size = + td->o.size / (td->o.nr_files ? td->o.nr_files : 1u); + struct fio_file *f; + uint32_t i; + size_t oname_len = 0; + + if (o->cluster_name) { + char *client_name = NULL; + + /* + * If we specify cluster name, the rados_create2 + * will not assume 'client.'. 
name is considered + * as a full type.id namestr + */ + if (o->client_name) { + if (!index(o->client_name, '.')) { + client_name = calloc(1, strlen("client.") + + strlen(o->client_name) + 1); + strcat(client_name, "client."); + strcat(client_name, o->client_name); + } else { + client_name = o->client_name; + } + } + + r = rados_create2(&rados->cluster, o->cluster_name, + client_name, 0); + + if (client_name && !index(o->client_name, '.')) + free(client_name); + } else + r = rados_create(&rados->cluster, o->client_name); + + if (r < 0) { + log_err("rados_create failed.\n"); + goto failed_early; + } + + r = rados_conf_read_file(rados->cluster, NULL); + if (r < 0) { + log_err("rados_conf_read_file failed.\n"); + goto failed_early; + } + + r = rados_connect(rados->cluster); + if (r < 0) { + log_err("rados_connect failed.\n"); + goto failed_early; + } + + r = rados_ioctx_create(rados->cluster, o->pool_name, &rados->io_ctx); + if (r < 0) { + log_err("rados_ioctx_create failed.\n"); + goto failed_shutdown; + } + + for (i = 0; i < rados->object_count; i++) { + f = td->files[i]; + f->real_file_size = file_size; + f->engine_pos = i; + + oname_len = strlen(f->file_name) + 32; + rados->objects[i] = malloc(oname_len); + /* vary objects for different jobs */ + snprintf(rados->objects[i], oname_len - 1, + "fio_rados_bench.%s.%x", + f->file_name, td->thread_number); + r = rados_write(rados->io_ctx, rados->objects[i], "", 0, 0); + if (r < 0) { + free(rados->objects[i]); + rados->objects[i] = NULL; + log_err("error creating object.\n"); + goto failed_obj_create; + } + } + + return 0; + +failed_obj_create: + _fio_rados_rm_objects(rados); + rados_ioctx_destroy(rados->io_ctx); + rados->io_ctx = NULL; +failed_shutdown: + rados_shutdown(rados->cluster); + rados->cluster = NULL; +failed_early: + return 1; +} + +static void _fio_rados_disconnect(struct rados_data *rados) +{ + if (!rados) + return; + + _fio_rados_rm_objects(rados); + + if (rados->io_ctx) { + 
rados_ioctx_destroy(rados->io_ctx); + rados->io_ctx = NULL; + } + + if (rados->cluster) { + rados_shutdown(rados->cluster); + rados->cluster = NULL; + } +} + +static void fio_rados_cleanup(struct thread_data *td) +{ + struct rados_data *rados = td->io_ops_data; + + if (rados) { + _fio_rados_disconnect(rados); + free(rados->objects); + free(rados->aio_events); + free(rados); + } +} + +static int fio_rados_queue(struct thread_data *td, struct io_u *io_u) +{ + struct rados_data *rados = td->io_ops_data; + struct fio_rados_iou *fri = io_u->engine_data; + char *object = rados->objects[io_u->file->engine_pos]; + int r = -1; + + fio_ro_check(td, io_u); + + if (io_u->ddir == DDIR_WRITE) { + r = rados_aio_create_completion(fri, NULL, + NULL, &fri->completion); + if (r < 0) { + log_err("rados_aio_create_completion failed.\n"); + goto failed; + } + + r = rados_aio_write(rados->io_ctx, object, fri->completion, + io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + if (r < 0) { + log_err("rados_write failed.\n"); + goto failed_comp; + } + return FIO_Q_QUEUED; + } else if (io_u->ddir == DDIR_READ) { + r = rados_aio_create_completion(fri, NULL, + NULL, &fri->completion); + if (r < 0) { + log_err("rados_aio_create_completion failed.\n"); + goto failed; + } + r = rados_aio_read(rados->io_ctx, object, fri->completion, + io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + if (r < 0) { + log_err("rados_aio_read failed.\n"); + goto failed_comp; + } + return FIO_Q_QUEUED; + } else if (io_u->ddir == DDIR_TRIM) { + r = rados_aio_create_completion(fri, NULL, + NULL , &fri->completion); + if (r < 0) { + log_err("rados_aio_create_completion failed.\n"); + goto failed; + } + fri->write_op = rados_create_write_op(); + if (fri->write_op == NULL) { + log_err("rados_create_write_op failed.\n"); + goto failed_comp; + } + rados_write_op_zero(fri->write_op, io_u->offset, + io_u->xfer_buflen); + r = rados_aio_write_op_operate(fri->write_op, rados->io_ctx, + fri->completion, object, NULL, 0); + if (r < 
0) { + log_err("rados_aio_write_op_operate failed.\n"); + goto failed_write_op; + } + return FIO_Q_QUEUED; + } + + log_err("WARNING: Only DDIR_READ, DDIR_WRITE and DDIR_TRIM are supported!"); + +failed_write_op: + rados_release_write_op(fri->write_op); +failed_comp: + rados_aio_release(fri->completion); +failed: + io_u->error = -r; + td_verror(td, io_u->error, "xfer"); + return FIO_Q_COMPLETED; +} + +static struct io_u *fio_rados_event(struct thread_data *td, int event) +{ + struct rados_data *rados = td->io_ops_data; + return rados->aio_events[event]; +} + +int fio_rados_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct rados_data *rados = td->io_ops_data; + struct rados_options *o = td->eo; + int busy_poll = o->busy_poll; + unsigned int events = 0; + struct io_u *u; + struct fio_rados_iou *fri; + unsigned int i; + rados_completion_t first_unfinished; + int observed_new = 0; + + /* loop through inflight ios until we find 'min' completions */ + do { + first_unfinished = NULL; + io_u_qiter(&td->io_u_all, u, i) { + if (!(u->flags & IO_U_F_FLIGHT)) + continue; + + fri = u->engine_data; + if (fri->completion) { + if (rados_aio_is_complete(fri->completion)) { + if (fri->write_op != NULL) { + rados_release_write_op(fri->write_op); + fri->write_op = NULL; + } + rados_aio_release(fri->completion); + fri->completion = NULL; + rados->aio_events[events] = u; + events++; + observed_new = 1; + } else if (first_unfinished == NULL) { + first_unfinished = fri->completion; + } + } + if (events >= max) + break; + } + if (events >= min) + return events; + if (first_unfinished == NULL || busy_poll) + continue; + + if (!observed_new) + rados_aio_wait_for_complete(first_unfinished); + } while (1); + return events; +} + +static int fio_rados_setup(struct thread_data *td) +{ + struct rados_data *rados = NULL; + int r; + /* allocate engine specific structure to deal with librados. 
*/ + r = _fio_setup_rados_data(td, &rados); + if (r) { + log_err("fio_setup_rados_data failed.\n"); + goto cleanup; + } + td->io_ops_data = rados; + + /* Force single process mode. + */ + td->o.use_thread = 1; + + /* connect in the main thread to determine + * the size of the given RADOS block device. And disconnect + * later on. + */ + r = _fio_rados_connect(td); + if (r) { + log_err("fio_rados_connect failed.\n"); + goto cleanup; + } + rados->connected = true; + + return 0; +cleanup: + fio_rados_cleanup(td); + return r; +} + +/* open/invalidate are noops. we set the FIO_DISKLESSIO flag in ioengine_ops to + prevent fio from creating the files +*/ +static int fio_rados_open(struct thread_data *td, struct fio_file *f) +{ + return 0; +} +static int fio_rados_invalidate(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static void fio_rados_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + struct fio_rados_iou *fri = io_u->engine_data; + + if (fri) { + io_u->engine_data = NULL; + fri->td = NULL; + if (fri->completion) + rados_aio_release(fri->completion); + if (fri->write_op) + rados_release_write_op(fri->write_op); + free(fri); + } +} + +static int fio_rados_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct fio_rados_iou *fri; + fri = calloc(1, sizeof(*fri)); + fri->io_u = io_u; + fri->td = td; + io_u->engine_data = fri; + return 0; +} + +/* ioengine_ops for get_ioengine() */ +static struct ioengine_ops ioengine = { + .name = "rados", + .version = FIO_IOOPS_VERSION, + .flags = FIO_DISKLESSIO, + .setup = fio_rados_setup, + .queue = fio_rados_queue, + .getevents = fio_rados_getevents, + .event = fio_rados_event, + .cleanup = fio_rados_cleanup, + .open_file = fio_rados_open, + .invalidate = fio_rados_invalidate, + .options = options, + .io_u_init = fio_rados_io_u_init, + .io_u_free = fio_rados_io_u_free, + .option_struct_size = sizeof(struct rados_options), +}; + +static void fio_init fio_rados_register(void) +{ + 
register_ioengine(&ioengine); +} + +static void fio_exit fio_rados_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/examples/rados.fio b/examples/rados.fio new file mode 100644 index 0000000..035cbff --- /dev/null +++ b/examples/rados.fio @@ -0,0 +1,24 @@ +###################################################################### +# Example test for the RADOS engine. +# +# Runs a 4k random write test against a RADOS via librados +# +# NOTE: Make sure you have either Ceph pool named 'rados' or change +# the pool parameter. +###################################################################### +[global] +#logging +#write_iops_log=write_iops_log +#write_bw_log=write_bw_log +#write_lat_log=write_lat_log +ioengine=rados +clientname=admin +pool=rados +busy_poll=0 +rw=randwrite +bs=4k + +[rbd_iodepth32] +iodepth=32 +size=128m +nr_files=32 diff --git a/fio.1 b/fio.1 index 91ae4a2..e488b01 100644 --- a/fio.1 +++ b/fio.1 @@ -1585,6 +1585,11 @@ size to the current block offset. \fBblocksize\fR is ignored. I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate defragment activity in request to DDIR_WRITE event. .TP +.B rados +I/O engine supporting direct access to Ceph Reliable Autonomic Distributed +Object Store (RADOS) via librados. This ioengine defines engine specific +options. +.TP .B rbd I/O engine supporting direct access to Ceph Rados Block Devices (RBD) via librbd without the need to use the kernel rbd driver. This @@ -1773,21 +1778,25 @@ after event. .RE .RE .TP -.BI (rbd)clustername \fR=\fPstr +.BI (rbd,rados)clustername \fR=\fPstr Specifies the name of the Ceph cluster. .TP .BI (rbd)rbdname \fR=\fPstr Specifies the name of the RBD. .TP -.BI (rbd)pool \fR=\fPstr -Specifies the name of the Ceph pool containing RBD. +.BI (rbd,rados)pool \fR=\fPstr +Specifies the name of the Ceph pool containing RBD or RADOS data. .TP -.BI (rbd)clientname \fR=\fPstr +.BI (rbd,rados)clientname \fR=\fPstr Specifies the username (without the 'client.' 
prefix) used to access the Ceph cluster. If the \fBclustername\fR is specified, the \fBclientname\fR shall be the full *type.id* string. If no type. prefix is given, fio will add 'client.' by default. .TP +.BI (rbd,rados)busy_poll \fR=\fPbool +Poll the store instead of waiting for completion. Usually this provides better +throughput at the cost of higher (up to 100%) CPU utilization. .TP .BI (mtd)skip_bad \fR=\fPbool Skip operations against known bad blocks. .TP -- To unsubscribe from this list: send the line "unsubscribe fio" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html