On Mon, 25 Jul 2011, Kevin Wolf wrote:
> Looks okay, except that in case of a crash you'll most likely corrupt
> the image because the order in which refcounts and mappings are written
> out is completely undefined. For a reliable implementation you need to
> make sure that for cluster allocation you first write out the refcount
> update, then fsync, then write out the mapping. Otherwise, if the
> mapping is written out first and a crash happens, you'll have clusters
> that are in use but marked free, so that on the next run a second
> cluster can be mapped to the same location.
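For context, the ordering described above reduces to three steps. Here is
a minimal sketch of it, reusing write_refcount_block() from the patch
below; publish_l2_mapping() is a hypothetical stand-in for whatever writes
out the dirty L2 entry, not an actual kvm tools helper:

/*
 * Sketch only: crash-safe cluster allocation. The cluster must be
 * marked used on disk *before* any L2 mapping that references it
 * is written out.
 */
static int alloc_cluster_ordered(struct qcow *q,
				 struct qcow_refcount_block *rfb, u64 rfb_idx,
				 struct qcow_l2_table *l2t, u64 l2t_idx,
				 u64 clust_start)
{
	/* 1. Bump the refcount and write the refcount block out. */
	rfb->entries[rfb_idx] = cpu_to_be16(1);
	rfb->dirty = 1;
	if (write_refcount_block(q, rfb) < 0)
		return -1;

	/* 2. Fence: the refcount must be stable before the mapping. */
	if (fdatasync(q->fd) < 0)
		return -1;

	/* 3. Only now publish the mapping that points at the cluster. */
	l2t->table[l2t_idx] = cpu_to_be64(clust_start | QCOW_OFLAG_COPIED);
	l2t->dirty = 1;
	return publish_l2_mapping(q, l2t);	/* hypothetical helper */
}

With this order, a crash after step 2 leaves at worst a leaked cluster
(allocated but unreferenced), which 'qemu-img check' can detect; with the
reverse order, a later run can map a second cluster over live data.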
Does this look better?

			Pekka
From 4bc5a97711094d50ec4a25ba65b01b204e986574 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@xxxxxxxxxx>
Date: Thu, 21 Jul 2011 12:04:39 +0300
Subject: [PATCH v2] kvm tools, qcow: Add support for writing to zero refcount clusters

This patch adds support for writing to zero refcount clusters. Refcount
blocks are cached like L2 tables and are flushed upon VIRTIO_BLK_T_FLUSH
and when evicted from the LRU cache.

With this patch applied, 'qemu-img check' no longer complains about
referenced clusters with zero reference count after

  dd if=/dev/zero of=/mnt/tmp

where '/mnt' is a freshly generated QCOW2 image.

Cc: Asias He <asias.hejun@xxxxxxxxx>
Cc: Cyrill Gorcunov <gorcunov@xxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxx>
Cc: Kevin Wolf <kwolf@xxxxxxxxxx>
Cc: Prasad Joshi <prasadjoshi124@xxxxxxxxx>
Cc: Sasha Levin <levinsasha928@xxxxxxxxx>
Signed-off-by: Pekka Enberg <penberg@xxxxxxxxxx>
---
v1 -> v2: Use fdatasync() after writing out refcount tables in
          qcow_disk_flush().

 tools/kvm/disk/qcow.c        |  270 ++++++++++++++++++++++++++++++++++++++++--
 tools/kvm/include/kvm/qcow.h |   24 ++++
 2 files changed, 286 insertions(+), 8 deletions(-)

diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
index 2ef1ecb..2471aeb 100644
--- a/tools/kvm/disk/qcow.c
+++ b/tools/kvm/disk/qcow.c
@@ -382,6 +382,189 @@ static u64 qcow_write_l2_table(struct qcow *q, u64 *table)
 	return off;
 }
 
+static void refcount_table_free_cache(struct qcow_refcount_table *rft)
+{
+	struct rb_root *r = &rft->root;
+	struct list_head *pos, *n;
+	struct qcow_refcount_block *t;
+
+	list_for_each_safe(pos, n, &rft->lru_list) {
+		list_del(pos);
+		t = list_entry(pos, struct qcow_refcount_block, list);
+		rb_erase(&t->node, r);
+
+		free(t);
+	}
+}
+
+static int refcount_block_insert(struct rb_root *root, struct qcow_refcount_block *new)
+{
+	struct rb_node **link = &(root->rb_node), *parent = NULL;
+	u64 offset = new->offset;
+
+	/* search the tree */
+	while (*link) {
+		struct qcow_refcount_block *t;
+
+		t = rb_entry(*link, struct qcow_refcount_block, node);
+		if (!t)
+			goto error;
+
+		parent = *link;
+
+		if (t->offset > offset)
+			link = &(*link)->rb_left;
+		else if (t->offset < offset)
+			link = &(*link)->rb_right;
+		else
+			goto out;
+	}
+
+	/* add new node */
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, root);
+out:
+	return 0;
+error:
+	return -1;
+}
+
+static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb)
+{
+	if (!rfb->dirty)
+		return 0;
+
+	if (pwrite_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb->offset) < 0)
+		return -1;
+
+	rfb->dirty = 0;
+
+	return 0;
+}
+
+static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
+{
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct rb_root *r = &rft->root;
+	struct qcow_refcount_block *lru;
+
+	if (rft->nr_cached == MAX_CACHE_NODES) {
+		lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
+
+		if (write_refcount_block(q, lru) < 0)
+			goto error;
+
+		rb_erase(&lru->node, r);
+		list_del_init(&lru->list);
+		rft->nr_cached--;
+
+		free(lru);
+	}
+
+	if (refcount_block_insert(r, c) < 0)
+		goto error;
+
+	list_add_tail(&c->list, &rft->lru_list);
+	rft->nr_cached++;
+
+	return 0;
+error:
+	return -1;
+}
+
+static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 rfb_offset)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_block *rfb;
+	u64 cluster_size;
+
+	cluster_size = 1 << header->cluster_bits;
+
+	rfb = malloc(sizeof *rfb + cluster_size);
+	if (!rfb)
+		return NULL;
+
+	rfb->offset = rfb_offset;
+	rfb->size = cluster_size / sizeof(u16);
+	RB_CLEAR_NODE(&rfb->node);
+	INIT_LIST_HEAD(&rfb->list);
+
+	return rfb;
+}
+
+static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, u64 offset)
+{
+	struct rb_node *link = root->rb_node;
+
+	while (link) {
+		struct qcow_refcount_block *t;
+
+		t = rb_entry(link, struct qcow_refcount_block, node);
+		if (!t)
+			goto out;
+
+		if (t->offset > offset)
+			link = link->rb_left;
+		else if (t->offset < offset)
+			link = link->rb_right;
+		else
+			return t;
+	}
+out:
+	return NULL;
+}
+
+static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 offset)
+{
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+
+	rfb = refcount_block_lookup(&rft->root, offset);
+	if (!rfb)
+		return NULL;
+
+	/* Update the LRU state, by moving the searched node to list tail */
+	list_move_tail(&rfb->list, &rft->lru_list);
+
+	return rfb;
+}
+
+static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64 clust_idx)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+	u64 rfb_offset;
+	u64 rft_idx;
+
+	rft_idx = clust_idx >> (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT);
+	if (rft_idx >= rft->rf_size)
+		return NULL;
+
+	rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
+
+	rfb = refcount_block_search(q, rfb_offset);
+	if (rfb)
+		return rfb;
+
+	rfb = new_refcount_block(q, rfb_offset);
+	if (!rfb)
+		return NULL;
+
+	if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb_offset) < 0)
+		goto error_free_rfb;
+
+	if (cache_refcount_block(q, rfb) < 0)
+		goto error_free_rfb;
+
+	return rfb;
+
+error_free_rfb:
+	free(rfb);
+
+	return NULL;
+}
+
 /*
  * QCOW file might grow during a write operation. Not only data but metadata is
  * also written at the end of the file. Therefore it is necessary to ensure
@@ -398,6 +581,7 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
 	struct qcow_l1_table *l1t = &q->table;
 	struct qcow_l2_table *l2t;
 	u64 clust_start;
+	u64 clust_flags;
 	u64 l2t_offset;
 	u64 clust_off;
 	u64 l2t_size;
@@ -435,7 +619,7 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
 		goto error;
 	}
 
 	if (!(l2t_offset & QCOW_OFLAG_COPIED)) {
-		pr_warning("copy-on-write clusters are not supported");
+		pr_warning("L2 copy-on-write clusters are not supported");
 		goto error;
 	}
@@ -476,23 +660,54 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
 	if (!f_sz)
 		goto error;
 
-	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
-	if (clust_start & QCOW_OFLAG_COMPRESSED) {
+	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+
+	clust_flags = clust_start & QCOW_OFLAGS_MASK;
+	if (clust_flags & QCOW_OFLAG_COMPRESSED) {
 		pr_warning("compressed clusters are not supported");
 		goto error;
 	}
-	if (!(clust_start & QCOW_OFLAG_COPIED)) {
-		pr_warning("copy-on-write clusters are not supported");
-		goto error;
-	}
 
 	clust_start &= QCOW_OFFSET_MASK;
 	if (!clust_start) {
 		clust_start = ALIGN(f_sz, clust_sz);
-		l2t->table[l2t_idx] = cpu_to_be64(clust_start);
+		l2t->table[l2t_idx] = cpu_to_be64(clust_start | QCOW_OFLAG_COPIED);
 		l2t->dirty = 1;
 	}
 
+	if (!(clust_flags & QCOW_OFLAG_COPIED)) {
+		struct qcow_refcount_block *rfb = NULL;
+		u16 clust_refcount;
+		u64 clust_idx;
+		u64 rfb_idx;
+
+		clust_idx = (clust_start & QCOW_OFFSET_MASK) >> (header->cluster_bits);
+
+		rfb = qcow_read_refcount_block(q, clust_idx);
+		if (!rfb) {
+			pr_warning("L1: error while reading refcount table");
+			goto error;
+		}
+
+		rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+		if (rfb_idx >= rfb->size) {
+			pr_warning("L1: refcount block index out of bounds");
+			goto error;
+		}
+
+		clust_refcount = be16_to_cpu(rfb->entries[rfb_idx]);
+		if (!clust_refcount) {
+			clust_refcount = 1;
+			rfb->entries[rfb_idx] = cpu_to_be16(clust_refcount);
+			rfb->dirty = 1;
+		}
+
+		if (clust_refcount > 1) {
+			pr_warning("L1 copy-on-write clusters are not supported");
+			goto error;
+		}
+	}
+
 	mutex_unlock(&q->mutex);
 
 	/* Write actual data */
@@ -547,15 +762,27 @@ static ssize_t qcow_nowrite_sector(struct disk_image *disk, u64 sector, void *sr
 static int qcow_disk_flush(struct disk_image *disk)
 {
 	struct qcow *q = disk->priv;
+	struct qcow_refcount_table *rft;
 	struct qcow_header *header;
 	struct list_head *pos, *n;
 	struct qcow_l1_table *l1t;
 
 	header = q->header;
 	l1t = &q->table;
+	rft = &q->refcount_table;
 
 	mutex_lock(&q->mutex);
 
+	list_for_each_safe(pos, n, &rft->lru_list) {
+		struct qcow_refcount_block *c = list_entry(pos, struct qcow_refcount_block, list);
+
+		if (write_refcount_block(q, c) < 0)
+			goto error_unlock;
+	}
+
+	if (fdatasync(disk->fd) < 0)
+		goto error_unlock;
+
 	list_for_each_safe(pos, n, &l1t->lru_list) {
 		struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list);
 
@@ -587,7 +814,9 @@ static int qcow_disk_close(struct disk_image *disk)
 
 	q = disk->priv;
 
+	refcount_table_free_cache(&q->refcount_table);
 	l1_table_free_cache(&q->table);
+	free(q->refcount_table.rf_table);
 	free(q->table.l1_table);
 	free(q->header);
 	free(q);
@@ -608,6 +837,26 @@ static struct disk_image_operations qcow_disk_ops = {
 	.close		= qcow_disk_close,
 };
 
+static int qcow_read_refcount_table(struct qcow *q)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	u64 cluster_size;
+
+	cluster_size = 1 << header->cluster_bits;
+
+	rft->rf_size = (header->refcount_table_size * cluster_size) / sizeof(u64);
+
+	rft->rf_table = calloc(rft->rf_size, sizeof(u64));
+	if (!rft->rf_table)
+		return -1;
+
+	rft->root = RB_ROOT;
+	INIT_LIST_HEAD(&rft->lru_list);
+
+	return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, header->refcount_table_offset);
+}
+
 static int qcow_read_l1_table(struct qcow *q)
 {
 	struct qcow_header *header = q->header;
@@ -656,6 +905,8 @@ static void *qcow2_read_header(int fd)
 		.l1_size	= f_header.l1_size,
 		.cluster_bits	= f_header.cluster_bits,
 		.l2_bits	= f_header.cluster_bits - 3,
+		.refcount_table_offset	= f_header.refcount_table_offset,
+		.refcount_table_size	= f_header.refcount_table_clusters,
 	};
 
 	return header;
@@ -687,6 +938,9 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
 	if (qcow_read_l1_table(q) < 0)
 		goto error;
 
+	if (qcow_read_refcount_table(q) < 0)
+		goto error;
+
 	/*
 	 * Do not use mmap use read/write instead
 	 */
diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
index 4ddc2b2..46db702 100644
--- a/tools/kvm/include/kvm/qcow.h
+++ b/tools/kvm/include/kvm/qcow.h
@@ -40,10 +40,32 @@ struct qcow_l1_table {
 	int			nr_cached;
 };
 
+#define QCOW_REFCOUNT_BLOCK_SHIFT	1
+
+struct qcow_refcount_block {
+	u64			offset;
+	struct rb_node		node;
+	struct list_head	list;
+	u64			size;
+	u8			dirty;
+	u16			entries[];
+};
+
+struct qcow_refcount_table {
+	u32			rf_size;
+	u64			*rf_table;
+
+	/* Refcount block caching data structures */
+	struct rb_root		root;
+	struct list_head	lru_list;
+	int			nr_cached;
+};
+
 struct qcow {
 	pthread_mutex_t		mutex;
 	void			*header;
 	struct qcow_l1_table	table;
+	struct qcow_refcount_table	refcount_table;
 	int			fd;
 };
 
@@ -53,6 +75,8 @@ struct qcow_header {
 	u32 l1_size;
 	u8 cluster_bits;
 	u8 l2_bits;
+	u64 refcount_table_offset;
+	u32 refcount_table_size;
 };
 
 struct qcow1_header_disk {
-- 
1.7.0.4