chunkd/be-fs.c  |  188 ++++++++++++++++++++++++++++++++++++++++++++++++-------
chunkd/chunkd.h |    3 +
2 files changed, 166 insertions(+), 25 deletions(-)

commit d49f7c94ebce80b9bf29d04491aaabfd542eb048
Author: Jeff Garzik <jeff@xxxxxxxxxx>
Date:   Mon Jul 19 19:43:05 2010 -0400

    chunkd: Add checksum table to on-disk format, one sum per 64k of data

    Signed-off-by: Jeff Garzik <jgarzik@xxxxxxxxxx>

diff --git a/chunkd/be-fs.c b/chunkd/be-fs.c
index 4b851a7..d714e7c 100644
--- a/chunkd/be-fs.c
+++ b/chunkd/be-fs.c
@@ -53,14 +53,23 @@ struct fs_obj {
 	int			in_fd;
 	char			*in_fn;
 	off_t			sendfile_ofs;
+
+	size_t			checked_bytes;	/* bytes hashed in cur. blk */
+	SHA_CTX			checksum;	/* running SHA1 of cur. blk */
+	unsigned int		csum_idx;	/* next csum_tbl slot */
+	void			*csum_tbl;	/* n_blk csums, CHD_CSUM_SZ each */
+	size_t			csum_tbl_sz;	/* csum_tbl size, in bytes */
+
+	unsigned int		n_blk;		/* CHUNK_BLK_SZ blks of data */
 };
 
 struct be_fs_obj_hdr {
 	char			magic[4];
 	uint32_t		key_len;
 	uint64_t		value_len;
+	uint32_t		n_blk;		/* csum table entries (LE) */
 
-	char			reserved[16];
+	char			reserved[12];
 
 	unsigned char		hash[CHD_CSUM_SZ];
 	char			owner[128];
@@ -208,6 +217,8 @@ static struct fs_obj *fs_obj_alloc(void)
 	obj->out_fd = -1;
 	obj->in_fd = -1;
 
+	SHA1_Init(&obj->checksum);
+
 	return obj;
 }
 
@@ -318,6 +329,18 @@ static bool key_valid(const void *key, size_t key_len)
 	return true;
 }
 
+/* number of CHUNK_BLK_SZ blocks needed to hold data_len bytes (rounds up) */
+static unsigned int fs_blk_count(uint64_t data_len)
+{
+	uint64_t n_blk;
+
+	n_blk = data_len >> CHUNK_BLK_ORDER;
+	if (data_len & (CHUNK_BLK_SZ - 1))
+		n_blk++;
+
+	return (unsigned int) n_blk;
+}
+
 struct backend_obj *fs_obj_new(uint32_t table_id,
 			       const void *key, size_t key_len,
 			       uint64_t data_len,
@@ -325,6 +348,7 @@ struct backend_obj *fs_obj_new(uint32_t table_id,
 {
 	struct fs_obj *obj;
 	char *fn = NULL;
+	size_t csum_bytes;
 	enum chunk_errcode erc = che_InternalError;
 	off_t skip_len;
 
@@ -339,6 +363,14 @@ struct backend_obj *fs_obj_new(uint32_t table_id,
 		return NULL;
 	}
 
+	obj->n_blk = fs_blk_count(data_len);
+	csum_bytes = (size_t) obj->n_blk * CHD_CSUM_SZ;
+	obj->csum_tbl = malloc(csum_bytes);
+	/* malloc(0) may return NULL; only a non-empty table can fail */
+	if (csum_bytes && !obj->csum_tbl)
+		goto err_out;
+	obj->csum_tbl_sz = csum_bytes;
+
 	/* build local fs pathname */
 	fn = fs_obj_pathname(table_id, key, key_len);
 	if (!fn)
@@ -359,7 +391,7 @@ struct backend_obj *fs_obj_new(uint32_t table_id,
 	obj->out_fn = fn;
 
 	/* calculate size of front-of-file metadata area */
-	skip_len = sizeof(struct be_fs_obj_hdr) + key_len;
+	skip_len = sizeof(struct be_fs_obj_hdr) + key_len + csum_bytes;
 
 	/* position file pointer where object data (as in, not metadata)
 	 * will begin
@@ -397,7 +429,10 @@ struct backend_obj *fs_obj_open(uint32_t table_id, const char *user,
 	struct be_fs_obj_hdr hdr;
 	ssize_t rrc;
 	uint64_t value_len, tmp64;
+	size_t csum_bytes;
 	enum chunk_errcode erc = che_InternalError;
+	struct iovec iov[2];
+	size_t total_rd_len;
 
 	if (!key_valid(key, key_len)) {
 		*err_code = che_InvalidKey;
@@ -457,23 +492,46 @@ struct backend_obj *fs_obj_open(uint32_t table_id, const char *user,
 		goto err_out;
 
 	value_len = GUINT64_FROM_LE(hdr.value_len);
+	obj->n_blk = GUINT32_FROM_LE(hdr.n_blk);
+	csum_bytes = (size_t) obj->n_blk * CHD_CSUM_SZ;
 
 	/* verify file size large enough to contain value */
-	tmp64 = value_len + sizeof(hdr) + key_len;
+	tmp64 = value_len + sizeof(hdr) + key_len + csum_bytes;
 	if (G_UNLIKELY(st.st_size < tmp64)) {
 		applog(LOG_ERR, "obj(%s) size error, too small", obj->in_fn);
 		goto err_out;
 	}
 
+	/* verify expected size of checksum table */
+	if (G_UNLIKELY(fs_blk_count(value_len) != obj->n_blk)) {
+		applog(LOG_ERR, "obj(%s) unexpected blk count "
+		       "(%u from val sz, %u from hdr)",
+		       obj->in_fn, fs_blk_count(value_len), obj->n_blk);
+		goto err_out;
+	}
+
+	obj->csum_tbl = malloc(csum_bytes);
+	/* malloc(0) may return NULL; only a non-empty table can fail */
+	if (csum_bytes && !obj->csum_tbl)
+		goto err_out;
+	obj->csum_tbl_sz = csum_bytes;
+
 	obj->bo.key = malloc(key_len);
 	obj->bo.key_len = key_len;
 	if (!obj->bo.key)
 		goto err_out;
 
-	/* read object variable-length header */
-	rrc = read(obj->in_fd, obj->bo.key, key_len);
-	if ((rrc != key_len) || (memcmp(key, obj->bo.key, key_len))) {
-		applog(LOG_ERR, "read hdr key obj(%s) failed: %s",
+	/* init additional header segment list */
+	iov[0].iov_base = obj->bo.key;
+	iov[0].iov_len = key_len;
+	iov[1].iov_base = obj->csum_tbl;
+	iov[1].iov_len = csum_bytes;
+	total_rd_len = iov[0].iov_len + iov[1].iov_len;
+
+	/* read additional header segments (key, checksum table) */
+	rrc = readv(obj->in_fd, iov, ARRAY_SIZE(iov));
+	if ((rrc != total_rd_len) || (memcmp(key, obj->bo.key, key_len))) {
+		applog(LOG_ERR, "read addnl hdrs(%s) failed: %s",
 		       obj->in_fn,
 		       (rrc < 0) ? strerror(errno) : "<unknown reasons>");
 		goto err_out;
@@ -516,6 +574,7 @@ void fs_obj_free(struct backend_obj *bo)
 	if (obj->in_fd >= 0)
 		close(obj->in_fd);
 
+	free(obj->csum_tbl);
 	free(obj);
 }
 
@@ -532,19 +591,81 @@ ssize_t fs_obj_read(struct backend_obj *bo, void *ptr, size_t len)
 	return rc;
 }
 
+/*
+ * Fold the running SHA1 of the current block into the next free slot of
+ * the checksum table, then reset the per-block state.
+ *
+ * Returns false (state untouched) when the table is already full.
+ */
+static bool obj_flush_csum(struct backend_obj *bo)
+{
+	struct fs_obj *obj = bo->private;
+	unsigned char md[CHD_CSUM_SZ];
+
+	if (G_UNLIKELY(obj->csum_idx >= obj->n_blk)) {
+		applog(LOG_ERR, "BUG %s: cidx %u, n_blk %u",
+		       __func__, obj->csum_idx, obj->n_blk);
+		return false;
+	}
+
+	SHA1_Final(md, &obj->checksum);
+
+	/* cast: ISO C defines no arithmetic on void pointers */
+	memcpy((unsigned char *) obj->csum_tbl +
+	       (size_t) obj->csum_idx * CHD_CSUM_SZ, md, CHD_CSUM_SZ);
+	obj->csum_idx++;
+
+	obj->checked_bytes = 0;
+	SHA1_Init(&obj->checksum);
+
+	return true;
+}
+
+/*
+ * Append len bytes at ptr to the object's data area, checksumming as we
+ * go: each write() is capped at the remainder of the current
+ * CHUNK_BLK_SZ block so that every block gets its own table entry.
+ *
+ * Returns the number of bytes written, or a negative value on failure.
+ */
 ssize_t fs_obj_write(struct backend_obj *bo, const void *ptr, size_t len)
 {
 	struct fs_obj *obj = bo->private;
-	ssize_t rc;
+	const unsigned char *buf = ptr;
+	ssize_t total_written = 0;
 
-	rc = write(obj->out_fd, ptr, len);
-	if (rc < 0)
-		applog(LOG_ERR, "obj write(%s) failed: %s",
-		       obj->out_fn, strerror(errno));
-	else
-		obj->written_bytes += rc;
+	while (len > 0) {
+		size_t unchecked;
+		ssize_t wrc;
 
-	return rc;
+		unchecked = CHUNK_BLK_SZ - obj->checked_bytes;
+
+		wrc = write(obj->out_fd, buf, MIN(unchecked, len));
+		if (wrc < 0) {
+			applog(LOG_ERR, "obj write(%s) failed: %s",
+			       obj->out_fn, strerror(errno));
+			return wrc;
+		}
+
+		SHA1_Update(&obj->checksum, buf, wrc);
+
+		total_written += wrc;
+		obj->written_bytes += wrc;
+		obj->checked_bytes += wrc;
+		buf += wrc;
+		len -= wrc;
+
+		/*
+		 * At end of 64k block, store its csum.  A failed flush leaves
+		 * checked_bytes == CHUNK_BLK_SZ, so the next write() would be
+		 * zero-length and this loop would never end: bail out instead.
+		 */
+		if (obj->checked_bytes == CHUNK_BLK_SZ &&
+		    !obj_flush_csum(bo))
+			return -1;
+	}
+
+	return total_written;
 }
 
 #if defined(HAVE_SENDFILE) && defined(__linux__)
@@ -554,10 +675,11 @@ ssize_t fs_obj_sendfile(struct backend_obj *bo, int out_fd, size_t len)
 	struct fs_obj *obj = bo->private;
 	ssize_t rc;
 
-	if (obj->sendfile_ofs == 0) {
-		obj->sendfile_ofs += sizeof(struct be_fs_obj_hdr);
-		obj->sendfile_ofs += bo->key_len;
-	}
+	if (obj->sendfile_ofs == 0)
+		obj->sendfile_ofs =
+			sizeof(struct be_fs_obj_hdr) +
+			bo->key_len +
+			obj->csum_tbl_sz;
 
 	rc = sendfile(out_fd, obj->in_fd, &obj->sendfile_ofs, len);
 	if (rc < 0)
@@ -575,10 +697,11 @@ ssize_t fs_obj_sendfile(struct backend_obj *bo, int out_fd, size_t len)
 	ssize_t rc;
 	off_t sbytes = 0;
 
-	if (obj->sendfile_ofs == 0) {
-		obj->sendfile_ofs += sizeof(struct be_fs_obj_hdr);
-		obj->sendfile_ofs += bo->key_len;
-	}
+	if (obj->sendfile_ofs == 0)
+		obj->sendfile_ofs =
+			sizeof(struct be_fs_obj_hdr) +
+			bo->key_len +
+			obj->csum_tbl_sz;
 
 	rc = sendfile(obj->in_fd, out_fd, obj->sendfile_ofs, len,
 		      NULL, &sbytes, 0);
@@ -610,7 +733,7 @@ bool fs_obj_write_commit(struct backend_obj *bo, const char *user,
 	struct be_fs_obj_hdr hdr;
 	ssize_t wrc;
 	size_t total_wr_len;
-	struct iovec iov[2];
+	struct iovec iov[3];
 
 	if (G_UNLIKELY(obj->bo.size != obj->written_bytes)) {
 		applog(LOG_ERR, "BUG(%s): size/written_bytes mismatch: %llu/%llu",
@@ -626,6 +749,19 @@ bool fs_obj_write_commit(struct backend_obj *bo, const char *user,
 	strncpy(hdr.owner, user, sizeof(hdr.owner));
 	hdr.key_len = GUINT32_TO_LE(bo->key_len);
 	hdr.value_len = GUINT64_TO_LE(obj->written_bytes);
+	hdr.n_blk = GUINT32_TO_LE(obj->n_blk);
+
+	/* update checksum table with final csum, if necessary */
+	if (obj->checked_bytes > 0 && !obj_flush_csum(bo))
+		return false;
+
+	if (G_UNLIKELY(obj->csum_idx != obj->n_blk)) {
+		applog(LOG_ERR, "BUG(%s): csum_idx/n_blk mismatch: %u/%u",
+		       obj->out_fn, obj->csum_idx, obj->n_blk);
+		return false;
+	}
+
+	obj->csum_idx = 0;
 
 	/* go back to beginning of file */
 	if (lseek(obj->out_fd, 0, SEEK_SET) < 0) {
@@ -639,7 +775,9 @@ bool fs_obj_write_commit(struct backend_obj *bo, const char *user,
 	iov[0].iov_len = sizeof(hdr);
 	iov[1].iov_base = bo->key;
 	iov[1].iov_len = bo->key_len;
-	total_wr_len = iov[0].iov_len + iov[1].iov_len;
+	iov[2].iov_base = obj->csum_tbl;
+	iov[2].iov_len = obj->csum_tbl_sz;
+	total_wr_len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len;
 
 	/* write object header segments */
 	wrc = writev(obj->out_fd, iov, ARRAY_SIZE(iov));
diff --git a/chunkd/chunkd.h b/chunkd/chunkd.h
index 72833f7..e73634b 100644
--- a/chunkd/chunkd.h
+++ b/chunkd/chunkd.h
@@ -36,6 +36,9 @@
 #endif
 
 enum {
+	CHUNK_BLK_ORDER		= 16,			/* 64k blocks */
+	CHUNK_BLK_SZ		= 1 << CHUNK_BLK_ORDER,
+
 	CLI_DATA_BUF_SZ		= 16 * 1024,
 
 	CHD_TRASH_MAX		= 1000,
--
To unsubscribe from this list: send the line "unsubscribe hail-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html