Am 14.05.2010 11:51, schrieb MORITA Kazutaka: > Sheepdog is a distributed storage system for QEMU. It provides highly > available block level storage volumes to VMs like Amazon EBS. This > patch adds a qemu block driver for Sheepdog. > > Sheepdog features are: > - No node in the cluster is special (no metadata node, no control > node, etc) > - Linear scalability in performance and capacity > - No single point of failure > - Autonomous management (zero configuration) > - Useful volume management support such as snapshot and cloning > - Thin provisioning > - Autonomous load balancing > > The more details are available at the project site: > http://www.osrg.net/sheepdog/ > > Signed-off-by: MORITA Kazutaka <morita.kazutaka@xxxxxxxxxxxxx> Once we solved the image creation thing, I think I'm going to be happy with the block interface. Of course, this is something that doesn't even directly affect the driver code, just the way it is used. I have no clue about the Sheepdog protocol, so I'm just trying to comment on some general details. > --- > Makefile.objs | 2 +- > block/sheepdog.c | 1831 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 1832 insertions(+), 1 deletions(-) > create mode 100644 block/sheepdog.c > > diff --git a/Makefile.objs b/Makefile.objs > index ecdd53e..6edbc57 100644 > --- a/Makefile.objs > +++ b/Makefile.objs > @@ -14,7 +14,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o > > block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o > block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o > -block-nested-y += parallels.o nbd.o blkdebug.o > +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o > block-nested-$(CONFIG_WIN32) += raw-win32.o > block-nested-$(CONFIG_POSIX) += raw-posix.o > block-nested-$(CONFIG_CURL) += curl.o > diff --git a/block/sheepdog.c b/block/sheepdog.c > new file mode 100644 > index 0000000..adf3a71 > --- /dev/null > +++ b/block/sheepdog.c > @@ -0,0 +1,1831 @@ > +/* > + * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation. > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License version > + * 2 as published by the Free Software Foundation. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
> + */ > +#include <netdb.h> > +#include <netinet/tcp.h> > + > +#include "qemu-common.h" > +#include "block_int.h" > + > +#define SD_PROTO_VER 0x01 > + > +#define SD_DEFAULT_ADDR "localhost:7000" > + > +#define SD_OP_CREATE_AND_WRITE_OBJ 0x01 > +#define SD_OP_READ_OBJ 0x02 > +#define SD_OP_WRITE_OBJ 0x03 > + > +#define SD_OP_NEW_VDI 0x11 > +#define SD_OP_LOCK_VDI 0x12 > +#define SD_OP_RELEASE_VDI 0x13 > +#define SD_OP_GET_VDI_INFO 0x14 > +#define SD_OP_READ_VDIS 0x15 > + > +#define SD_FLAG_CMD_WRITE 0x01 > +#define SD_FLAG_CMD_COW 0x02 > + > +#define SD_RES_SUCCESS 0x00 /* Success */ > +#define SD_RES_UNKNOWN 0x01 /* Unknown error */ > +#define SD_RES_NO_OBJ 0x02 /* No object found */ > +#define SD_RES_EIO 0x03 /* I/O error */ > +#define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */ > +#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */ > +#define SD_RES_SYSTEM_ERROR 0x06 /* System error */ > +#define SD_RES_VDI_LOCKED 0x07 /* Vdi is locked */ > +#define SD_RES_NO_VDI 0x08 /* No vdi found */ > +#define SD_RES_NO_BASE_VDI 0x09 /* No base vdi found */ > +#define SD_RES_VDI_READ 0x0A /* Cannot read requested vdi */ > +#define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */ > +#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */ > +#define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base vdi */ > +#define SD_RES_NO_TAG 0x0E /* Requested tag is not found */ > +#define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */ > +#define SD_RES_VDI_NOT_LOCKED 0x10 /* Vdi is not locked */ > +#define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */ > +#define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */ > +#define SD_RES_FULL_VDI 0x13 /* we already have the maximum vdis */ > +#define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */ > +#define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */ > +#define SD_RES_WAIT_FOR_FORMAT 0x16 /* Sheepdog is waiting for a format operation */ > +#define SD_RES_WAIT_FOR_JOIN 0x17 /* Sheepdog is waiting for other nodes joining */ > +#define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */ > + > +/* > + * Object ID rules > + * > + * 0 - 19 (20 bits): data object space > + * 20 - 31 (12 bits): reserved data object space > + * 32 - 55 (24 bits): vdi object space > + * 56 - 59 ( 4 bits): reserved vdi object space > + * 60 - 63 ( 4 bits): object type indentifier space > + */ > + > +#define VDI_SPACE_SHIFT 32 > +#define VDI_BIT (UINT64_C(1) << 63) > +#define VMSTATE_BIT (UINT64_C(1) << 62) > +#define MAX_DATA_OBJS (1ULL << 20) > +#define MAX_CHILDREN 1024 > +#define SD_MAX_VDI_LEN 256 > +#define SD_NR_VDIS (1U << 24) > +#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) > + > +#define SD_INODE_SIZE (sizeof(struct sd_inode)) > +#define CURRENT_VDI_ID 0 > + > +struct sd_req { > + uint8_t proto_ver; > + uint8_t opcode; > + uint16_t flags; > + uint32_t epoch; > + uint32_t id; > + uint32_t data_length; > + uint32_t opcode_specific[8]; > +}; CODING_STYLE says that structs should be typedefed and their names should be in CamelCase. So something like this: typedef struct SheepdogReq { ... 
} SheepdogReq; (Or, if your prefer, SDReq; but with things like SDAIOCB I think it becomes hard to read) > + > +struct sd_rsp { > + uint8_t proto_ver; > + uint8_t opcode; > + uint16_t flags; > + uint32_t epoch; > + uint32_t id; > + uint32_t data_length; > + uint32_t result; > + uint32_t opcode_specific[7]; > +}; > + > +struct sd_obj_req { > + uint8_t proto_ver; > + uint8_t opcode; > + uint16_t flags; > + uint32_t epoch; > + uint32_t id; > + uint32_t data_length; > + uint64_t oid; > + uint64_t cow_oid; > + uint32_t copies; > + uint32_t rsvd; > + uint64_t offset; > +}; > + > +struct sd_obj_rsp { > + uint8_t proto_ver; > + uint8_t opcode; > + uint16_t flags; > + uint32_t epoch; > + uint32_t id; > + uint32_t data_length; > + uint32_t result; > + uint32_t copies; > + uint32_t pad[6]; > +}; > + > +struct sd_vdi_req { > + uint8_t proto_ver; > + uint8_t opcode; > + uint16_t flags; > + uint32_t epoch; > + uint32_t id; > + uint32_t data_length; > + uint64_t vdi_size; > + uint32_t base_vdi_id; > + uint32_t copies; > + uint32_t snapid; > + uint32_t pad[3]; > +}; > + > +struct sd_vdi_rsp { > + uint8_t proto_ver; > + uint8_t opcode; > + uint16_t flags; > + uint32_t epoch; > + uint32_t id; > + uint32_t data_length; > + uint32_t result; > + uint32_t rsvd; > + uint32_t vdi_id; > + uint32_t pad[5]; > +}; > + > +struct sd_inode { > + char name[SD_MAX_VDI_LEN]; > + uint64_t ctime; > + uint64_t snap_ctime; > + uint64_t vm_clock_nsec; > + uint64_t vdi_size; > + uint64_t vm_state_size; > + uint16_t copy_policy; > + uint8_t nr_copies; > + uint8_t block_size_shift; > + uint32_t snap_id; > + uint32_t vdi_id; > + uint32_t parent_vdi_id; > + uint32_t child_vdi_id[MAX_CHILDREN]; > + uint32_t data_vdi_id[MAX_DATA_OBJS]; > +}; > + > +/* > + * 64 bit FNV-1a non-zero initial basis > + */ > +#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL) > + > +/* > + * 64 bit Fowler/Noll/Vo FNV-1a hash code > + */ > +static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval) > +{ > + unsigned char *bp = (unsigned char *) buf; > + unsigned char *be = bp + len; > + while (bp < be) { > + hval ^= (uint64_t) *bp++; > + hval += (hval << 1) + (hval << 4) + (hval << 5) + > + (hval << 7) + (hval << 8) + (hval << 40); > + } > + return hval; > +} > + > +static inline int is_data_obj_writeable(struct sd_inode *inode, unsigned int idx) > +{ > + return inode->vdi_id == inode->data_vdi_id[idx]; > +} > + > +static inline int is_data_obj(uint64_t oid) > +{ > + return !(VDI_BIT & oid); > +} > + > +static inline uint64_t data_oid_to_idx(uint64_t oid) > +{ > + return oid & (MAX_DATA_OBJS - 1); > +} > + > +static inline uint64_t vid_to_vdi_oid(uint32_t vid) > +{ > + return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT); > +} > + > +static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx) > +{ > + return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; > +} > + > +static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx) > +{ > + return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; > +} > + > +#undef eprintf > +#define eprintf(fmt, args...) \ > +do { \ > + fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \ > +} while (0) What about using error_report() instead of fprintf? Though it should be the same currently. > + > +#undef dprintf > +#ifdef DEBUG_SDOG > +#define dprintf(fmt, args...) \ > +do { \ > + fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ > +} while (0) > +#else > +#define dprintf(fmt, args...) 
> +#endif > + > +#define min_t(type, x, y) ({ \ > + type __min1 = (x); \ > + type __min2 = (y); \ > + __min1 < __min2 ? __min1: __min2; }) > + > +struct aio_req { > + struct sd_aiocb *aiocb; > + unsigned int iov_offset; > + > + uint64_t oid; > + uint64_t base_oid; > + uint64_t offset; > + unsigned int data_len; > + uint8_t flags; > + > + QLIST_ENTRY(aio_req) pending_siblings; > + QLIST_ENTRY(aio_req) aioreq_siblings; > +}; > + > +enum aiocb_state { > + AIOCB_WRITE_UDATA, > + AIOCB_READ_UDATA, > +}; > + > +struct sd_aiocb { > + BlockDriverAIOCB common; > + > + QEMUIOVector *qiov; > + > + int64_t sector_num; > + int nb_sectors; > + > + int ret; > + enum aiocb_state aiocb_type; > + > + QEMUBH *bh; > + void (*aio_done_func)(struct sd_aiocb *); > + > + int canceled; > + > + QLIST_HEAD(aioreq_head, aio_req) aioreq_head; > +}; > + > +#define MAX_AIO_REQS 4096 > + > +struct bdrv_sd_state { > + struct sd_inode inode; > + > + int nr_dirty_data_oids; > + uint32_t dirty_data_oids[MAX_AIO_REQS]; > + > + char name[SD_MAX_VDI_LEN]; > + int is_current; > + > + char *addr; > + int fd; > + > + struct aio_req aio_req_list[MAX_AIO_REQS]; > + struct aio_req *aio_req_free[MAX_AIO_REQS]; > + int nr_aio_req_free; > + > + QLIST_HEAD(pending_head, aio_req) pending_head; > +}; > + > +static const char * sd_strerror(int err) > +{ > + int i; > + > + static const struct { > + int err; > + const char *desc; > + } errors[] = { > + {SD_RES_SUCCESS, "Success"}, > + {SD_RES_UNKNOWN, "Unknown error"}, > + {SD_RES_NO_OBJ, "No object found"}, > + {SD_RES_EIO, "I/O error"}, > + {SD_RES_VDI_EXIST, "VDI exists already"}, > + {SD_RES_INVALID_PARMS, "Invalid parameters"}, > + {SD_RES_SYSTEM_ERROR, "System error"}, > + {SD_RES_VDI_LOCKED, "VDI is already locked"}, > + {SD_RES_NO_VDI, "No vdi found"}, > + {SD_RES_NO_BASE_VDI, "No base VDI found"}, > + {SD_RES_VDI_READ, "Failed read the requested VDI"}, > + {SD_RES_VDI_WRITE, "Failed to write the requested VDI"}, > + {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"}, > + {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"}, > + {SD_RES_NO_TAG, "Failed to find the requested tag"}, > + {SD_RES_STARTUP, "The system is still booting"}, > + {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"}, > + {SD_RES_SHUTDOWN, "The system is shutting down"}, > + {SD_RES_NO_MEM, "Out of memory on the server"}, > + {SD_RES_FULL_VDI, "We already have the maximum vdis"}, > + {SD_RES_VER_MISMATCH, "Protocol version mismatch"}, > + {SD_RES_NO_SPACE, "Server has no space for new objects"}, > + {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"}, > + {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"}, > + {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"}, > + }; > + > + for (i = 0; i < ARRAY_SIZE(errors); ++i) > + if (errors[i].err == err) > + return errors[i].desc; CODING_STYLE requires braces here. > + > + return "Invalid error code"; > +} > + > +static inline int before(uint32_t seq1, uint32_t seq2) > +{ > + return (int32_t)(seq1 - seq2) < 0; > +} > + > +static inline int after(uint32_t seq1, uint32_t seq2) > +{ > + return (int32_t)(seq2 - seq1) < 0; > +} These functions look strange... Is the difference to seq1 < seq2 that the cast introduces intentional? (after(0x0, 0xabcdefff) == 1) If yes, why is this useful? This needs a comment. If no, why even bother to have this function instead of directly using < or > ? 
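If the answer is yes (i.e. this is meant to be a wraparound-safe comparison of sequence numbers, like the TCP helpers of the same name), then a comment along these lines would already be enough for me. Note that this is only my guess at the intended semantics:

/*
 * Compare sequence numbers modulo 2^32: the difference is computed
 * in uint32_t and then interpreted as a signed 32 bit value, so
 * e.g. before(0xffffffff, 0x1) is true after the counter wraps.
 */
static inline int before(uint32_t seq1, uint32_t seq2)
{
    return (int32_t)(seq1 - seq2) < 0;
}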
> + > +static inline struct aio_req *alloc_aio_req(struct bdrv_sd_state *s, > + struct sd_aiocb *acb, > + uint64_t oid, unsigned int data_len, > + uint64_t offset, uint8_t flags, > + uint64_t base_oid, > + unsigned int iov_offset) > +{ > + struct aio_req *aio_req; > + > + if (!s->nr_aio_req_free) > + return NULL; > + > + aio_req = s->aio_req_free[--s->nr_aio_req_free]; > + aio_req->aiocb = acb; > + aio_req->iov_offset = iov_offset; > + aio_req->oid = oid; > + aio_req->base_oid = base_oid; > + aio_req->offset = offset; > + aio_req->data_len = data_len; > + aio_req->flags = flags; > + > + QLIST_INSERT_HEAD(&s->pending_head, aio_req, pending_siblings); > + QLIST_INSERT_HEAD(&acb->aioreq_head, aio_req, aioreq_siblings); > + > + return aio_req; > +} > + > +static inline int free_aio_req(struct bdrv_sd_state *s, struct aio_req *aio_req) > +{ > + struct sd_aiocb *acb = aio_req->aiocb; > + QLIST_REMOVE(aio_req, pending_siblings); > + QLIST_REMOVE(aio_req, aioreq_siblings); > + aio_req->aiocb = NULL; > + s->aio_req_free[s->nr_aio_req_free++] = aio_req; > + > + return !QLIST_EMPTY(&acb->aioreq_head); > +} > + > +static inline int nr_outstanding_aio_req(struct bdrv_sd_state *s) > +{ > + return MAX_AIO_REQS - s->nr_aio_req_free; > +} > + > +static inline int get_id_from_req(struct bdrv_sd_state *s, struct aio_req *aio_req) > +{ > + return aio_req - s->aio_req_list; > +} > + > +static inline struct aio_req *get_req_from_id(struct bdrv_sd_state *s, int id) > +{ > + return s->aio_req_list + id; > +} > + > +static void sd_finish_aiocb(struct sd_aiocb *acb) > +{ > + if (!acb->canceled) > + acb->common.cb(acb->common.opaque, acb->ret); > + qemu_aio_release(acb); > +} > + > +static void sd_aio_cancel(BlockDriverAIOCB *blockacb) > +{ > + struct sd_aiocb *acb = (struct sd_aiocb *)blockacb; > + > + acb->canceled = 1; > +} > + > +static AIOPool sd_aio_pool = { > + .aiocb_size = sizeof(struct sd_aiocb), > + .cancel = sd_aio_cancel, > +}; > + > +static struct sd_aiocb *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, > + int64_t sector_num, int nb_sectors, > + BlockDriverCompletionFunc *cb, > + void *opaque) > +{ > + struct sd_aiocb *acb; > + > + acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque); > + > + acb->qiov = qiov; > + > + acb->sector_num = sector_num; > + acb->nb_sectors = nb_sectors; > + > + acb->aio_done_func = NULL; > + acb->canceled = 0; > + acb->bh = NULL; > + acb->ret = 0; > + QLIST_INIT(&acb->aioreq_head); > + return acb; > +} > + > +static int sd_schedule_bh(QEMUBHFunc *cb, struct sd_aiocb *acb) > +{ > + if (acb->bh) { > + eprintf("bug: %d %d\n", acb->aiocb_type, acb->aiocb_type); > + return -EIO; > + } > + > + acb->bh = qemu_bh_new(cb, acb); > + if (!acb->bh) { > + eprintf("oom: %d %d\n", acb->aiocb_type, acb->aiocb_type); > + return -EIO; > + } > + > + qemu_bh_schedule(acb->bh); > + > + return 0; > +} > + > +static int do_send_recv(int sockfd, struct iovec *iov, int len, int offset, > + int write) > +{ > + struct msghdr msg; > + int ret, diff; > + > + memset(&msg, 0, sizeof(msg)); > + msg.msg_iov = iov; > + msg.msg_iovlen = 1; > + > + len += offset; > + > + while (iov->iov_len < len) { > + len -= iov->iov_len; > + > + iov++; > + msg.msg_iovlen++; > + } > + > + diff = iov->iov_len - len; > + iov->iov_len -= diff; > + > + while (msg.msg_iov->iov_len <= offset) { > + offset -= msg.msg_iov->iov_len; > + > + msg.msg_iov++; > + msg.msg_iovlen--; > + } > + > + msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base + offset; > + msg.msg_iov->iov_len -= offset; > + > + if (write) > + ret = 
sendmsg(sockfd, &msg, 0); > + else > + ret = recvmsg(sockfd, &msg, MSG_WAITALL); > + > + msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base - offset; > + msg.msg_iov->iov_len += offset; > + > + iov->iov_len += diff; > + return ret; > +} > + > +static int connect_to_sdog(const char *addr) > +{ > + char buf[64]; > + char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV]; > + char name[256], *p; > + int fd, ret; > + struct addrinfo hints, *res, *res0; > + int port = 0; > + > + if (!addr) > + addr = SD_DEFAULT_ADDR; > + > + strcpy(name, addr); > + > + p = name; > + while (*p) { > + if (*p == ':') { > + *p++ = '\0'; > + break; > + } else > + p++; > + } > + > + if (*p == '\0') { > + eprintf("cannot find a port number, %s\n", name); > + return -1; > + } > + port = strtol(p, NULL, 10); > + if (port == 0) { > + eprintf("invalid port number, %s\n", p); > + return -1; > + } > + > + memset(&hints, 0, sizeof(hints)); > + snprintf(buf, sizeof(buf), "%d", port); > + > + hints.ai_socktype = SOCK_STREAM; > + > + ret = getaddrinfo(name, buf, &hints, &res0); > + if (ret) { > + eprintf("unable to get address info %s, %m\n", name); > + return -1; > + } > + > + for (res = res0; res; res = res->ai_next) { > + ret = getnameinfo(res->ai_addr, res->ai_addrlen, > + hbuf, sizeof(hbuf), sbuf, sizeof(sbuf), > + NI_NUMERICHOST | NI_NUMERICSERV); > + if (ret) > + continue; > + > + fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); > + if (fd < 0) > + continue; > + > +reconnect: > + ret = connect(fd, res->ai_addr, res->ai_addrlen); > + if (ret < 0) { > + if (errno == EINTR) > + goto reconnect; > + break; > + } > + > + dprintf("connected to %s:%d\n", name, port); > + goto success; > + } > + fd = -1; > + eprintf("failed connect to %s:%d\n", name, port); > +success: > + freeaddrinfo(res0); > + return fd; > +} > + > +static int do_readv_writev(int sockfd, struct iovec *iov, int len, > + int iov_offset, int write) > +{ > + int ret; > +again: > + ret = do_send_recv(sockfd, iov, len, iov_offset, write); > + if (ret < 0) { > + if (errno == EINTR || errno == EAGAIN) > + goto again; > + eprintf("failed to recv a rsp, %m\n"); > + return 1; > + } > + > + iov_offset += ret; > + len -= ret; > + if (len) > + goto again; > + > + return 0; > +} > + > +static int do_readv(int sockfd, struct iovec *iov, int len, int iov_offset) > +{ > + return do_readv_writev(sockfd, iov, len, iov_offset, 0); > +} > + > +static int do_writev(int sockfd, struct iovec *iov, int len, int iov_offset) > +{ > + return do_readv_writev(sockfd, iov, len, iov_offset, 1); > +} > + > +static int do_read_write(int sockfd, void *buf, int len, int write) > +{ > + struct iovec iov; > + > + iov.iov_base = buf; > + iov.iov_len = len; > + > + return do_readv_writev(sockfd, &iov, len, 0, write); > +} > + > +static int do_read(int sockfd, void *buf, int len) > +{ > + return do_read_write(sockfd, buf, len, 0); > +} > + > +static int do_write(int sockfd, void *buf, int len) > +{ > + return do_read_write(sockfd, buf, len, 1); > +} > + > +static int send_req(int sockfd, struct sd_req *hdr, void *data, > + unsigned int *wlen) > +{ > + int ret; > + struct iovec iov[2]; > + > + iov[0].iov_base = hdr; > + iov[0].iov_len = sizeof(*hdr); > + > + if (*wlen) { > + iov[1].iov_base = data; > + iov[1].iov_len = *wlen; > + } > + > + ret = do_writev(sockfd, iov, sizeof(*hdr) + *wlen, 0); > + if (ret) { > + eprintf("failed to send a req, %m\n"); > + ret = -1; > + } > + > + return ret; > +} > + > +static int do_req(int sockfd, struct sd_req *hdr, void *data, > + unsigned int *wlen, unsigned 
int *rlen) > +{ > + int ret; > + > + ret = send_req(sockfd, hdr, data, wlen); > + if (ret) { > + ret = -1; > + goto out; > + } > + > + ret = do_read(sockfd, hdr, sizeof(*hdr)); > + if (ret) { > + eprintf("failed to get a rsp, %m\n"); > + ret = -1; > + goto out; > + } > + > + if (*rlen > hdr->data_length) > + *rlen = hdr->data_length; > + > + if (*rlen) { > + ret = do_read(sockfd, data, *rlen); > + if (ret) { > + eprintf("failed to get the data, %m\n"); > + ret = -1; > + goto out; > + } > + } > + ret = 0; > +out: > + return ret; > +} > + > +static int add_aio_request(struct bdrv_sd_state *s, struct aio_req *aio_req, > + struct iovec *iov, int niov, int create, > + enum aiocb_state aiocb_type); > + > +static void send_pending_req(struct bdrv_sd_state *s, uint64_t oid, uint32_t id) > +{ > + struct aio_req *aio_req, *next; > + struct sd_aiocb *acb; > + int ret; > + > + QLIST_FOREACH_SAFE(aio_req, &s->pending_head, pending_siblings, next) { > + if (id == get_id_from_req(s, aio_req)) > + continue; > + if (aio_req->oid != oid) > + continue; > + > + acb = aio_req->aiocb; > + ret = add_aio_request(s, aio_req, acb->qiov->iov, > + acb->qiov->niov, 0, acb->aiocb_type); > + if (ret < 0) { > + eprintf("add_aio_request is faled\n"); > + free_aio_req(s, aio_req); > + if (QLIST_EMPTY(&acb->aioreq_head)) > + sd_finish_aiocb(acb); > + } > + } > +} > + > +static void aio_read_response(void *opaque) > +{ > + struct sd_obj_req hdr; > + struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr; > + struct bdrv_sd_state *s = (struct bdrv_sd_state *)opaque; > + int fd = s->fd; > + int ret; > + struct aio_req *aio_req; > + struct sd_aiocb *acb; > + int rest; > + unsigned long idx; > + > + if (!nr_outstanding_aio_req(s)) > + return; > + > + ret = do_read(fd, (void *)rsp, sizeof(*rsp)); > + if (ret) { > + eprintf("failed to get the header, %m\n"); > + return; > + } > + > + aio_req = get_req_from_id(s, rsp->id); > + acb = aio_req->aiocb; > + > + switch (acb->aiocb_type) { > + case AIOCB_WRITE_UDATA: > + if (!is_data_obj(aio_req->oid)) > + break; > + idx = data_oid_to_idx(aio_req->oid); > + > + if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) { > + s->inode.data_vdi_id[idx] = s->inode.vdi_id; > + s->dirty_data_oids[s->nr_dirty_data_oids++] = idx; > + > + send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx), > + rsp->id); > + } > + break; > + case AIOCB_READ_UDATA: > + ret = do_readv(fd, acb->qiov->iov, rsp->data_length, > + aio_req->iov_offset); > + if (ret) { > + eprintf("failed to get the data, %m\n"); > + return; > + } > + break; > + } > + > + if (rsp->result != SD_RES_SUCCESS) { > + acb->ret = -EIO; > + eprintf("%s\n", sd_strerror(rsp->result)); > + } > + > + rest = free_aio_req(s, aio_req); > + if (!rest) > + acb->aio_done_func(acb); > +} > + > +static int aio_flush_request(void *opaque) > +{ > + return nr_outstanding_aio_req((struct bdrv_sd_state *)opaque); > +} > + > +static int set_nonblocking(int fd) > +{ > + int ret; > + > + ret = fcntl(fd, F_GETFL); > + if (ret < 0) { > + eprintf("can't fcntl (F_GETFL), %m\n"); > + close(fd); > + } else { > + ret = fcntl(fd, F_SETFL, ret | O_NONBLOCK); > + if (ret < 0) > + eprintf("can't fcntl (O_NONBLOCK), %m\n"); > + } > + > + return ret; > +} > + > +static int set_nodelay(int fd) > +{ > + int ret, opt; > + > + opt = 1; > + ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &opt, sizeof(opt)); > + return ret; > +} > + > +static int get_sheep_fd(struct bdrv_sd_state *s) > +{ > + int ret, fd; > + > + fd = connect_to_sdog(s->addr); > + if (fd < 0) { > + eprintf("%m\n"); > + return 
-1; > + } > + > + ret = set_nonblocking(fd); > + if (ret) { > + eprintf("%m\n"); > + close(fd); > + return -1; > + } > + > + ret = set_nodelay(fd); > + if (ret) { > + eprintf("%m\n"); > + close(fd); > + return -1; > + } > + > + qemu_aio_set_fd_handler(fd, aio_read_response, NULL, aio_flush_request, > + NULL, s); > + s->fd = fd; > + > + return fd; > +} > + > +static int parse_vdiname(struct bdrv_sd_state *s, const char *filename, > + char *vdi, int vdi_len, uint32_t *snapid) > +{ > + char *p, *q; > + int nr_sep; > + > + p = q = strdup(filename); > + > + if (!p) > + return 1; > + > + nr_sep = 0; > + while (*p) { > + if (*p == ':') > + nr_sep++; > + if (nr_sep == 2) > + break; > + p++; > + } > + > + if (nr_sep == 2) > + *p++ = '\0'; > + else > + p = q; > + > + strncpy(vdi, p, vdi_len); > + > + p = strchr(vdi, ':'); > + if (p) { > + *p++ = '\0'; > + *snapid = strtol(p, NULL, 10); > + } else > + *snapid = CURRENT_VDI_ID; /* search current vdi */ > + > + if (nr_sep == 2) > + s->addr = q; > + else { > + free(q); > + s->addr = NULL; > + } > + > + return 0; > +} > + > +static int find_vdi_name(struct bdrv_sd_state *s, char *filename, uint32_t snapid, > + uint32_t *vid, int for_snapshot) > +{ > + int ret, fd; > + struct sd_vdi_req hdr; > + struct sd_vdi_rsp *rsp = (struct sd_vdi_rsp *)&hdr; > + unsigned int wlen, rlen = 0; > + char buf[SD_MAX_VDI_LEN]; > + > + fd = connect_to_sdog(s->addr); > + if (fd < 0) > + return -1; > + > + memset(&hdr, 0, sizeof(hdr)); > + snprintf(buf, sizeof(buf), "%s", filename); > + if (for_snapshot) > + hdr.opcode = SD_OP_GET_VDI_INFO; > + else > + hdr.opcode = SD_OP_LOCK_VDI; > + wlen = SD_MAX_VDI_LEN; > + hdr.proto_ver = SD_PROTO_VER; > + hdr.data_length = SD_MAX_VDI_LEN; > + hdr.snapid = snapid; > + hdr.flags = SD_FLAG_CMD_WRITE; > + > + ret = do_req(fd, (struct sd_req *)&hdr, buf, &wlen, &rlen); > + if (ret) { > + ret = -1; > + goto out; > + } > + > + if (rsp->result != SD_RES_SUCCESS) { > + eprintf("%s, %s\n", sd_strerror(rsp->result), filename); > + ret = -1; > + goto out; > + } > + *vid = rsp->vdi_id; > + > + ret = 0; > +out: > + close(fd); > + return ret; > +} > + > +static int add_aio_request(struct bdrv_sd_state *s, struct aio_req *aio_req, > + struct iovec *iov, int niov, int create, > + enum aiocb_state aiocb_type) > +{ > + int nr_copies = s->inode.nr_copies; > + struct sd_obj_req hdr; > + unsigned int wlen; > + int ret, opt; > + uint64_t oid = aio_req->oid; > + unsigned int datalen = aio_req->data_len; > + uint64_t offset = aio_req->offset; > + uint8_t flags = aio_req->flags; > + uint64_t old_oid = aio_req->base_oid; > + > + if (!nr_copies) > + eprintf("bug\n"); > + > + memset(&hdr, 0, sizeof(hdr)); > + > + if (aiocb_type == AIOCB_READ_UDATA) { > + wlen = 0; > + hdr.opcode = SD_OP_READ_OBJ; > + hdr.flags = flags; > + } else if (create) { > + wlen = datalen; > + hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; > + hdr.flags = SD_FLAG_CMD_WRITE | flags; > + } else { > + wlen = datalen; > + hdr.opcode = SD_OP_WRITE_OBJ; > + hdr.flags = SD_FLAG_CMD_WRITE | flags; > + } > + > + hdr.oid = oid; > + hdr.cow_oid = old_oid; > + hdr.copies = s->inode.nr_copies; > + > + hdr.data_length = datalen; > + hdr.offset = offset; > + > + hdr.id = get_id_from_req(s, aio_req); > + > + opt = 1; > + setsockopt(s->fd, SOL_TCP, TCP_CORK, &opt, sizeof(opt)); > + > + ret = do_write(s->fd, &hdr, sizeof(hdr)); > + if (ret) { > + eprintf("failed to send a req, %m\n"); > + return -EIO; > + } > + > + if (wlen) { > + ret = do_writev(s->fd, iov, wlen, aio_req->iov_offset); > + if (ret) { > + 
eprintf("failed to send a data, %m\n"); > + return -EIO; > + } > + } > + opt = 0; > + setsockopt(s->fd, SOL_TCP, TCP_CORK, &opt, sizeof(opt)); > + > + return 0; > +} > + > +static int read_write_object(int fd, char *buf, uint64_t oid, int copies, > + unsigned int datalen, uint64_t offset, > + int write, int create) > +{ > + struct sd_obj_req hdr; > + struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr; > + unsigned int wlen, rlen; > + int ret; > + > + memset(&hdr, 0, sizeof(hdr)); > + > + if (write) { > + wlen = datalen; > + rlen = 0; > + hdr.flags = SD_FLAG_CMD_WRITE; > + if (create) > + hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; > + else > + hdr.opcode = SD_OP_WRITE_OBJ; > + } else { > + wlen = 0; > + rlen = datalen; > + hdr.opcode = SD_OP_READ_OBJ; > + } > + hdr.oid = oid; > + hdr.data_length = datalen; > + hdr.offset = offset; > + hdr.copies = copies; > + > + ret = do_req(fd, (struct sd_req *)&hdr, buf, &wlen, &rlen); > + if (ret) { > + eprintf("failed to send a request to the sheep\n"); > + return -1; > + } > + > + switch (rsp->result) { > + case SD_RES_SUCCESS: > + return 0; > + default: > + eprintf("%s\n", sd_strerror(rsp->result)); > + return -1; > + } > +} > + > +static int read_object(int fd, char *buf, uint64_t oid, int copies, > + unsigned int datalen, uint64_t offset) > +{ > + return read_write_object(fd, buf, oid, copies, datalen, offset, 0, 0); > +} > + > +static int write_object(int fd, char *buf, uint64_t oid, int copies, > + unsigned int datalen, uint64_t offset, int create) > +{ > + return read_write_object(fd, buf, oid, copies, datalen, offset, 1, create); > +} > + > +/* TODO: error cleanups */ > +static int sd_open(BlockDriverState *bs, const char *filename, int flags) > +{ > + int ret, i, fd; > + uint32_t vid = 0; > + struct bdrv_sd_state *s = bs->opaque; > + char vdi[256]; > + uint32_t snapid; > + int for_snapshot = 0; > + char *buf; > + > + strstart(filename, "sheepdog:", (const char **)&filename); > + > + buf = qemu_malloc(SD_INODE_SIZE); > + if (!buf) { > + eprintf("Failed to allocate memory\n"); > + return -1; > + } > + > + for (i = 0; i < MAX_AIO_REQS; i++) { > + s->aio_req_free[i] = &s->aio_req_list[i]; > + s->aio_req_list[i].aiocb = NULL; > + } > + s->nr_aio_req_free = MAX_AIO_REQS; > + > + memset(vdi, 0, sizeof(vdi)); > + if (parse_vdiname(s, filename, vdi, sizeof(vdi), &snapid) < 0) > + goto out; > + s->fd = get_sheep_fd(s); > + if (s->fd < 0) > + return -1; > + > + if (snapid != CURRENT_VDI_ID) > + for_snapshot = 1; > + > + ret = find_vdi_name(s, vdi, snapid, &vid, for_snapshot); > + if (ret) > + goto out; > + > + if (snapid) > + dprintf("%" PRIx32 " non current inode was open.\n", vid); > + else > + s->is_current = 1; > + > + fd = connect_to_sdog(s->addr); I wonder why you need to open another connection here instead of using s->fd. This pattern repeats at least in the snapshot functions, so I'm sure it's there for a reason. Maybe add a comment? 
> + if (fd < 0) { > + eprintf("failed to connect\n"); > + goto out; > + } > + > + ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0); > + > + close(fd); > + > + if (ret) > + goto out; > + > + memcpy(&s->inode, buf, sizeof(s->inode)); > + s->nr_dirty_data_oids = 0; > + > + bs->total_sectors = s->inode.vdi_size >> 9; > + strncpy(s->name, vdi, sizeof(s->name)); > + qemu_free(buf); > + > + QLIST_INIT(&s->pending_head); > + return 0; > +out: > + qemu_free(buf); > + return -1; > +} > + > +static int do_sd_create(const char *addr, char *filename, char *tag, > + int64_t total_sectors, uint32_t base_vid, > + uint32_t *vdi_id, int snapshot) > +{ > + struct sd_vdi_req hdr; > + struct sd_vdi_rsp *rsp = (struct sd_vdi_rsp *)&hdr; > + int fd, ret; > + unsigned int wlen, rlen = 0; > + char buf[SD_MAX_VDI_LEN]; > + > + fd = connect_to_sdog(addr); > + if (fd < 0) > + return -1; > + > + strncpy(buf, filename, SD_MAX_VDI_LEN); > + > + memset(&hdr, 0, sizeof(hdr)); > + hdr.opcode = SD_OP_NEW_VDI; > + hdr.base_vdi_id = base_vid; > + > + wlen = SD_MAX_VDI_LEN; > + > + hdr.flags = SD_FLAG_CMD_WRITE; > + hdr.snapid = snapshot; > + > + hdr.data_length = wlen; > + hdr.vdi_size = total_sectors * 512; > + > + ret = do_req(fd, (struct sd_req *)&hdr, buf, &wlen, &rlen); > + > + close(fd); > + > + if (ret) > + return -1; > + > + if (rsp->result != SD_RES_SUCCESS) { > + eprintf("%s, %s\n", sd_strerror(rsp->result), filename); > + return -1; > + } > + > + if (vdi_id) > + *vdi_id = rsp->vdi_id; > + > + return 0; > +} > + > +static int sd_create(const char *filename, QEMUOptionParameter *options) > +{ > + int ret; > + uint32_t vid = 0; > + int64_t total_sectors = 0; > + char *backing_file = NULL; > + > + strstart(filename, "sheepdog:", (const char **)&filename); > + > + while (options && options->name) { > + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { > + total_sectors = options->value.n / 512; > + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { > + backing_file = options->value.s; > + } > + options++; > + } > + > + if (backing_file) { > + BlockDriverState bs; > + char vdi[SD_MAX_VDI_LEN]; > + uint32_t snapid; > + > + strstart(backing_file, "sheepdog:", (const char **)&backing_file); > + memset(&bs, 0, sizeof(bs)); > + > + bs.opaque = qemu_malloc(sizeof(struct bdrv_sd_state)); > + if (!bs.opaque) > + return -1; > + > + ret = sd_open(&bs, backing_file, 0); > + if (ret < 0) > + return -1; > + > + if (parse_vdiname(bs.opaque, backing_file, vdi, sizeof(vdi), &snapid) < 0) > + return -1; > + > + /* cannot clone from a current inode */ > + if (snapid == CURRENT_VDI_ID) > + return -1; > + > + ret = find_vdi_name(bs.opaque, vdi, snapid, &vid, 1); > + if (ret) > + return -1; > + } > + > + return do_sd_create(NULL, (char *)filename, NULL, total_sectors, vid, > + NULL, 0); > +} > + > +static void sd_close(BlockDriverState *bs) > +{ > + struct bdrv_sd_state *s = bs->opaque; > + struct sd_vdi_req hdr; > + struct sd_vdi_rsp *rsp = (struct sd_vdi_rsp *)&hdr; > + unsigned int wlen, rlen = 0; > + int fd, ret; > + > + dprintf("%s\n", s->name); > + > + fd = connect_to_sdog(s->addr); > + if (fd < 0) > + return; > + > + memset(&hdr, 0, sizeof(hdr)); > + > + hdr.opcode = SD_OP_RELEASE_VDI; > + wlen = strlen(s->name) + 1; > + hdr.data_length = wlen; > + hdr.flags = SD_FLAG_CMD_WRITE; > + > + ret = do_req(fd, (struct sd_req *)&hdr, s->name, &wlen, &rlen); > + > + close(fd); > + > + if (!ret && rsp->result != SD_RES_SUCCESS && > + rsp->result != SD_RES_VDI_NOT_LOCKED) > + eprintf("%s, %s\n", 
sd_strerror(rsp->result), s->name); > + > + close(s->fd); > + free(s->addr); > +} > + > +static void sd_write_done(struct sd_aiocb *acb) > +{ > + int ret, i; > + struct bdrv_sd_state *s = acb->common.bs->opaque; > + struct iovec iov; > + struct aio_req *aio_req; > + uint32_t offset, data_len, mn, mx; > + > + if (s->nr_dirty_data_oids) { > + mn = mx = s->dirty_data_oids[0]; > + for (i = 0; i < s->nr_dirty_data_oids; i++) { > + if (mn > s->dirty_data_oids[i]) > + mn = s->dirty_data_oids[i]; > + if (mx < s->dirty_data_oids[i]) > + mx = s->dirty_data_oids[i]; > + } > + offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) + > + mn * sizeof(s->inode.data_vdi_id[0]); > + data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]); > + s->nr_dirty_data_oids = 0; > + > + iov.iov_base = &s->inode; > + iov.iov_len = sizeof(s->inode); > + aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), > + data_len, offset, 0, 0, offset); > + if (!aio_req) { > + eprintf("too many requests\n"); > + acb->ret = -EIO; > + goto out; > + } Randomly failing requests is probably not a good idea. The guest might decide that the disk/file system is broken and stop using it. Can't you use a list like in AIOPool, so you can dynamically add new requests as needed? > + ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA); > + if (ret) { > + free_aio_req(s, aio_req); > + acb->ret = -EIO; > + goto out; > + } > + > + acb->aio_done_func = sd_finish_aiocb; > + acb->aiocb_type = AIOCB_WRITE_UDATA; > + return; > + } > +out: > + sd_finish_aiocb(acb); > +} > + > +static int sd_create_branch(struct bdrv_sd_state *s) > +{ > + int ret, fd; > + uint32_t vid; > + char *buf; > + > + dprintf("%" PRIx32 " is not current.\n", s->inode.vdi_id); > + > + buf = qemu_malloc(SD_INODE_SIZE); > + if (!buf) > + return -1; > + > + ret = do_sd_create(s->addr, s->name, NULL, s->inode.vdi_size >> 9, > + s->inode.vdi_id, &vid, 1); > + if (ret) > + goto out; > + > + dprintf("%" PRIx32 " is created.\n", vid); > + > + fd = connect_to_sdog(s->addr); > + if (fd < 0) { > + eprintf("failed to connect\n"); > + goto out; > + } > + > + ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies, > + SD_INODE_SIZE, 0); > + > + close(fd); > + > + if (ret < 0) > + goto out; > + > + memcpy(&s->inode, buf, sizeof(s->inode)); > + > + s->is_current = 1; > + ret = 0; > + dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id); > + > +out: > + qemu_free(buf); > + > + return ret; > +} > + > +static void sd_readv_writev_bh_cb(void *p) > +{ > + struct sd_aiocb *acb = p; > + int ret = 0; > + unsigned long len, done = 0, total = acb->nb_sectors * 512; > + unsigned long idx = acb->sector_num * 512 / SD_DATA_OBJ_SIZE; > + uint64_t oid; > + uint64_t offset = (acb->sector_num * 512) % SD_DATA_OBJ_SIZE; > + struct bdrv_sd_state *s = acb->common.bs->opaque; > + struct sd_inode *inode = &s->inode; > + struct aio_req *aio_req; > + > + qemu_bh_delete(acb->bh); > + acb->bh = NULL; > + > + if (acb->aiocb_type == AIOCB_WRITE_UDATA && !s->is_current) { > + ret = sd_create_branch(s); > + if (ret) { > + acb->ret = -EIO; > + goto out; > + } > + } > + > + while (done != total) { > + uint8_t flags = 0; > + uint64_t old_oid = 0; > + int create = 0; > + > + oid = vid_to_data_oid(inode->data_vdi_id[idx], idx); > + > + len = min_t(unsigned long, total - done, SD_DATA_OBJ_SIZE - offset); > + > + if (!inode->data_vdi_id[idx]) { > + if (acb->aiocb_type == AIOCB_READ_UDATA) > + goto done; > + > + create = 1; > + } else if (acb->aiocb_type == AIOCB_WRITE_UDATA > + && 
!is_data_obj_writeable(inode, idx)) { > + create = 1; > + old_oid = oid; > + flags = SD_FLAG_CMD_COW; > + } > + > + if (create) { > + dprintf("update ino (%" PRIu32") %" > + PRIu64 " %" PRIu64 " %" PRIu64 "\n", > + inode->vdi_id, oid, > + vid_to_data_oid(inode->data_vdi_id[idx], idx), idx); > + oid = vid_to_data_oid(inode->vdi_id, idx); > + dprintf("new oid %lx\n", oid); > + } > + > + aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, > + old_oid, done); > + if (!aio_req) { > + eprintf("too many requests\n"); > + acb->ret = -EIO; > + goto out; > + } > + > + if (create) { > + struct aio_req *areq; > + QLIST_FOREACH(areq, &s->pending_head, pending_siblings) { > + if (get_id_from_req(s, areq) == get_id_from_req(s, aio_req)) > + continue; > + if (areq->oid == oid) { > + aio_req->flags = 0; > + aio_req->base_oid = 0; > + goto done; > + } > + } > + } > + > + ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, > + create, acb->aiocb_type); > + if (ret < 0) { > + eprintf("add_aio_request is faled\n"); > + free_aio_req(s, aio_req); > + acb->ret = -EIO; > + goto out; > + } > + done: > + offset = 0; > + idx++; > + done += len; > + } > +out: > + if (QLIST_EMPTY(&acb->aioreq_head)) > + sd_finish_aiocb(acb); > +} > + > +static BlockDriverAIOCB *sd_aio_writev(BlockDriverState *bs, > + int64_t sector_num, > + QEMUIOVector *qiov, > + int nb_sectors, > + BlockDriverCompletionFunc *cb, > + void *opaque) > +{ > + struct sd_aiocb *acb; > + > + acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque); > + acb->aio_done_func = sd_write_done; > + acb->aiocb_type = AIOCB_WRITE_UDATA; > + > + sd_schedule_bh(sd_readv_writev_bh_cb, acb); > + return &acb->common; > +} > + > +static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs, > + int64_t sector_num, > + QEMUIOVector *qiov, > + int nb_sectors, > + BlockDriverCompletionFunc *cb, > + void *opaque) > +{ > + struct sd_aiocb *acb; > + int i; > + > + acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque); > + acb->aiocb_type = AIOCB_READ_UDATA; > + acb->aio_done_func = sd_finish_aiocb; > + > + /* > + * TODO: we can do better; we don't need to initialize > + * blindly. > + */ > + for (i = 0; i < qiov->niov; i++) > + memset(qiov->iov[i].iov_base, 0, qiov->iov[i].iov_len); > + > + sd_schedule_bh(sd_readv_writev_bh_cb, acb); > + return &acb->common; > +} > + > +static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) > +{ > + struct bdrv_sd_state *s = bs->opaque; > + int ret, fd; > + uint32_t new_vid; > + struct sd_inode *inode; > + unsigned int datalen; > + uint64_t offset; > + > + dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %d " > + "is_current %d\n", sn_info->name, sn_info->id_str, > + s->name, sn_info->vm_state_size, s->is_current); > + > + if (!s->is_current) { > + eprintf("You can't create a snapshot of " > + "a non current VDI, %s (%" PRIu32 ").\n", > + s->name, s->inode.vdi_id); > + > + return -1; > + } > + > + dprintf("%s %s\n", sn_info->name, sn_info->id_str); > + > + s->inode.vm_state_size = sn_info->vm_state_size; > + s->inode.vm_clock_nsec = sn_info->vm_clock_nsec; > + offset = 0; > + /* we don't need to read entire object */ > + datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); > + > + /* refresh inode. 
*/ > + fd = connect_to_sdog(s->addr); > + if (fd < 0) { > + ret = -EIO; > + goto cleanup; > + } > + > + ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id), > + s->inode.nr_copies, datalen, offset, 0); > + if (ret < 0) { > + eprintf("failed to write snapshot's inode.\n"); > + ret = -EIO; > + goto cleanup; > + } > + > + ret = do_sd_create(s->addr, s->name, NULL, s->inode.vdi_size >> 9, > + s->inode.vdi_id, &new_vid, 1); > + if (ret < 0) { > + eprintf("failed to create inode for snapshot. %m\n"); > + ret = -EIO; > + goto cleanup; > + } > + > + inode = (struct sd_inode *)qemu_malloc(datalen); > + if (!inode) { > + eprintf("failed to allocate memory for inode. %m\n"); > + goto cleanup; > + } > + > + ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid), > + s->inode.nr_copies, datalen, offset); > + > + close(fd); > + > + if (ret < 0) { > + eprintf("failed to read new inode info. %m\n"); > + ret = -EIO; > + goto cleanup; > + } > + > + memcpy(&s->inode, inode, datalen); > + dprintf("s->inode: name %s snap_id %x oid %x\n", > + s->inode.name, s->inode.snap_id, s->inode.vdi_id); > + > +cleanup: > + close(fd); > + return ret; > +} > + > +static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) > +{ > + struct bdrv_sd_state *s = bs->opaque; > + struct bdrv_sd_state *old_s; > + char vdi[SD_MAX_VDI_LEN]; > + char *buf = NULL; > + uint32_t vid; > + uint32_t snapid = 0; > + int ret = -ENOENT, fd; > + > + old_s = qemu_malloc(sizeof(struct bdrv_sd_state)); > + if (!old_s) { qemu_malloc never returns NULL. > + eprintf("failed to allocate memory for old state. %m\n"); > + goto out; > + } > + > + memcpy(old_s, s, sizeof(struct bdrv_sd_state)); > + > + snapid = strtol(snapshot_id, NULL, 10); > + if (!snapid) { > + eprintf("Invalid snapshot_id\n"); > + goto out; > + } > + > + buf = qemu_malloc(SD_INODE_SIZE); > + if (!buf) { > + eprintf("Failed to allocate memory\n"); > + goto out; > + } > + strncpy(vdi, s->name, sizeof(vdi)); > + ret = find_vdi_name(s, vdi, snapid, &vid, 1); > + if (ret) { > + eprintf("Failed to find_vdi_name\n"); > + ret = -ENOENT; > + goto out; > + } > + > + fd = connect_to_sdog(s->addr); > + if (fd < 0) { > + eprintf("failed to connect\n"); > + goto out; > + } > + > + ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies, > + SD_INODE_SIZE, 0); > + > + close(fd); > + > + if (ret) { > + ret = -ENOENT; > + goto out; > + } > + > + memcpy(&s->inode, buf, sizeof(s->inode)); > + > + if (!s->inode.vm_state_size) { > + eprintf("Invalid snapshot\n"); > + ret = -ENOENT; > + goto out; > + } > + > + s->is_current = 0; > + > + qemu_free(buf); > + qemu_free(old_s); > + > + return 0; > +out: > + /* recover bdrv_sd_state */ > + memcpy(s, old_s, sizeof(struct bdrv_sd_state)); > + qemu_free(buf); > + qemu_free(old_s); > + > + eprintf("failed to open. recover old bdrv_sd_state.\n"); > + > + return ret; > +} > + > +static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) > +{ > + /* FIXME: Delete specified snapshot id. */ > + return 0; > +} Ok, obviously there's something missing. 
;-) > + > +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) > +#define BITS_PER_BYTE 8 > +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) > +#define DECLARE_BITMAP(name,bits) \ > + unsigned long name[BITS_TO_LONGS(bits)] > + > +#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long)) > + > +static inline int test_bit(unsigned int nr, const unsigned long *addr) > +{ > + return ((1UL << (nr % BITS_PER_LONG)) & > + (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0; > +} > + > +static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) > +{ > + struct bdrv_sd_state *s = bs->opaque; > + struct sd_req req; > + int i, fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long); > + QEMUSnapshotInfo *sn_tab = NULL; > + unsigned wlen, rlen; > + int found = 0; > + static struct sd_inode inode; > + unsigned long *vdi_inuse; > + unsigned int start_nr; > + > + vdi_inuse = qemu_malloc(max); > + if (!vdi_inuse) > + return 0; > + > + fd = connect_to_sdog(s->addr); > + if (fd < 0) > + goto out; > + > + rlen = max; > + wlen = 0; > + > + memset(&req, 0, sizeof(req)); > + > + req.opcode = SD_OP_READ_VDIS; > + req.data_length = max; > + > + ret = do_req(fd, (struct sd_req *)&req, vdi_inuse, &wlen, &rlen); > + > + close(fd); > + if (ret) > + goto out; > + > + sn_tab = qemu_mallocz(nr * sizeof(*sn_tab)); > + if (!sn_tab) > + goto out; > + > + start_nr = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT) & (SD_NR_VDIS - 1); > + > + fd = connect_to_sdog(s->addr); > + if (fd < 0) { > + eprintf("failed to connect\n"); > + goto out; > + } > + > + /* TODO: round up */ > + for (i = start_nr; i < SD_NR_VDIS && found < nr; i++) { > + if (!test_bit(i, vdi_inuse)) > + break; > + > + /* we don't need to read entire object */ > + ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(i), > + 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0); > + > + if (ret) > + continue; > + > + if (!strcmp(inode.name, s->name) && inode.snap_ctime) { > + sn_tab[found].date_sec = inode.snap_ctime >> 32; > + sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff; > + sn_tab[found].vm_state_size = inode.vm_state_size; > + sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec; > + > + snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u", > + inode.snap_id); > + found++; > + } > + } > + > + close(fd); > +out: > + *psn_tab = sn_tab; > + > + qemu_free(vdi_inuse); > + > + return found; > +} > + > +static int do_load_save_vmstate(struct bdrv_sd_state *s, uint8_t *data, > + int64_t pos, int size, int load) > +{ > + int fd, create; > + int ret = 0; > + unsigned int data_len; > + uint64_t vmstate_oid; > + uint32_t vdi_index; > + uint64_t offset; > + > + fd = connect_to_sdog(s->addr); > + if (fd < 0) { > + ret = -EIO; > + goto cleanup; > + } > + > + while (size) { > + vdi_index = pos / SD_DATA_OBJ_SIZE; > + offset = pos % SD_DATA_OBJ_SIZE; > + > + data_len = min_t(unsigned int, size, SD_DATA_OBJ_SIZE); > + > + vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index); > + > + create = (offset == 0); > + if (load) > + ret = read_object(fd, (char *)data, vmstate_oid, > + s->inode.nr_copies, data_len, offset); > + else > + ret = write_object(fd, (char *)data, vmstate_oid, > + s->inode.nr_copies, data_len, offset, create); > + > + if (ret < 0) { > + eprintf("failed to save vmstate %m\n"); > + ret = -EIO; > + goto cleanup; > + } > + > + pos += data_len; > + size -= data_len; > + ret += data_len; > + } > +cleanup: > + close(fd); > + return ret; > +} > + > +static int 
sd_save_vmstate(BlockDriverState *bs, const uint8_t *data, > + int64_t pos, int size) > +{ > + struct bdrv_sd_state *s = bs->opaque; > + > + return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0); > +} > + > +static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data, > + int64_t pos, int size) > +{ > + struct bdrv_sd_state *s = bs->opaque; > + > + return do_load_save_vmstate(s, data, pos, size, 1); > +} > + > + > +static QEMUOptionParameter sd_create_options[] = { > + { > + .name = BLOCK_OPT_SIZE, > + .type = OPT_SIZE, > + .help = "Virtual disk size" > + }, > + { > + .name = BLOCK_OPT_BACKING_FILE, > + .type = OPT_STRING, > + .help = "File name of a base image" > + }, > + { NULL } > +}; > + > +BlockDriver bdrv_sheepdog = { > + .format_name = "sheepdog", > + .protocol_name = "sheepdog", > + .instance_size = sizeof(struct bdrv_sd_state), > + .bdrv_file_open = sd_open, > + .bdrv_close = sd_close, > + .bdrv_create = sd_create, > + > + .bdrv_aio_readv = sd_aio_readv, > + .bdrv_aio_writev = sd_aio_writev, > + > + .bdrv_snapshot_create = sd_snapshot_create, > + .bdrv_snapshot_goto = sd_snapshot_goto, > + .bdrv_snapshot_delete = sd_snapshot_delete, > + .bdrv_snapshot_list = sd_snapshot_list, > + > + .bdrv_save_vmstate = sd_save_vmstate, > + .bdrv_load_vmstate = sd_load_vmstate, > + > + .create_options = sd_create_options, > +}; Please align the = to the same column, at least in each block. Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html