This patch adds support for writing to zero refcount clusters. Refcount
blocks are cached like L2 tables and are flushed upon VIRTIO_BLK_T_FLUSH
and when evicted from the LRU cache. With this patch applied, 'qemu-img
check' no longer complains about referenced clusters with a zero reference
count after 'dd if=/dev/zero of=/mnt/tmp', where '/mnt' is a freshly
generated QCOW2 image.

Cc: Asias He <asias.hejun@xxxxxxxxx>
Cc: Cyrill Gorcunov <gorcunov@xxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxx>
Cc: Kevin Wolf <kwolf@xxxxxxxxxx>
Cc: Prasad Joshi <prasadjoshi124@xxxxxxxxx>
Cc: Sasha Levin <levinsasha928@xxxxxxxxx>
Signed-off-by: Pekka Enberg <penberg@xxxxxxxxxx>
---
 tools/kvm/disk/qcow.c        |  267 ++++++++++++++++++++++++++++++++++++++++--
 tools/kvm/include/kvm/qcow.h |   24 ++++
 2 files changed, 283 insertions(+), 8 deletions(-)
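A note on the index arithmetic used by qcow_read_refcount_block() and
qcow_write_cluster() below: refcount entries are u16, so one refcount block
holds cluster_size / sizeof(u16) == 2^(cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)
entries, and a cluster index splits into a refcount table slot (the high bits)
and a slot within the refcount block (the low bits). A minimal standalone
sketch of that split; the cluster size and cluster index are made-up values
and main() is just a driver for illustration:

#include <stdio.h>
#include <stdint.h>

#define QCOW_REFCOUNT_BLOCK_SHIFT	1	/* a refcount entry is a u16, i.e. 2^1 bytes */

int main(void)
{
	uint64_t cluster_bits	= 16;		/* made-up: 64 KiB clusters */
	uint64_t clust_idx	= 123456;	/* made-up cluster index */

	/* One refcount block covers 2^(cluster_bits - SHIFT) clusters. */
	uint64_t bits		= cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT;
	uint64_t rft_idx	= clust_idx >> bits;			/* refcount table slot */
	uint64_t rfb_idx	= clust_idx & ((1ULL << bits) - 1);	/* slot within the block */

	printf("rft_idx=%llu rfb_idx=%llu\n",
	       (unsigned long long) rft_idx, (unsigned long long) rfb_idx);

	return 0;
}

With a 64 KiB cluster size each refcount block covers 32768 clusters, i.e.
2 GiB of image data per refcount table entry, which keeps the table itself
small.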
diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
index 2ef1ecb..7c48dfb 100644
--- a/tools/kvm/disk/qcow.c
+++ b/tools/kvm/disk/qcow.c
@@ -382,6 +382,189 @@ static u64 qcow_write_l2_table(struct qcow *q, u64 *table)
 	return off;
 }
 
+static void refcount_table_free_cache(struct qcow_refcount_table *rft)
+{
+	struct rb_root *r = &rft->root;
+	struct list_head *pos, *n;
+	struct qcow_refcount_block *t;
+
+	list_for_each_safe(pos, n, &rft->lru_list) {
+		list_del(pos);
+		t = list_entry(pos, struct qcow_refcount_block, list);
+		rb_erase(&t->node, r);
+
+		free(t);
+	}
+}
+
+static int refcount_block_insert(struct rb_root *root, struct qcow_refcount_block *new)
+{
+	struct rb_node **link = &(root->rb_node), *parent = NULL;
+	u64 offset = new->offset;
+
+	/* search the tree */
+	while (*link) {
+		struct qcow_refcount_block *t;
+
+		t = rb_entry(*link, struct qcow_refcount_block, node);
+		if (!t)
+			goto error;
+
+		parent = *link;
+
+		if (t->offset > offset)
+			link = &(*link)->rb_left;
+		else if (t->offset < offset)
+			link = &(*link)->rb_right;
+		else
+			goto out;
+	}
+
+	/* add new node */
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, root);
+out:
+	return 0;
+error:
+	return -1;
+}
+
+static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb)
+{
+	if (!rfb->dirty)
+		return 0;
+
+	if (pwrite_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb->offset) < 0)
+		return -1;
+
+	rfb->dirty = 0;
+
+	return 0;
+}
+
+static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
+{
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct rb_root *r = &rft->root;
+	struct qcow_refcount_block *lru;
+
+	if (rft->nr_cached == MAX_CACHE_NODES) {
+		lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
+
+		if (write_refcount_block(q, lru) < 0)
+			goto error;
+
+		rb_erase(&lru->node, r);
+		list_del_init(&lru->list);
+		rft->nr_cached--;
+
+		free(lru);
+	}
+
+	if (refcount_block_insert(r, c) < 0)
+		goto error;
+
+	list_add_tail(&c->list, &rft->lru_list);
+	rft->nr_cached++;
+
+	return 0;
+error:
+	return -1;
+}
+
+static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 rfb_offset)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_block *rfb;
+	u64 cluster_size;
+
+	cluster_size = 1 << header->cluster_bits;
+
+	rfb = calloc(1, sizeof *rfb + cluster_size); /* zeroed so rfb->dirty starts out clean */
+	if (!rfb)
+		return NULL;
+
+	rfb->offset = rfb_offset;
+	rfb->size = cluster_size / sizeof(u16);
+	RB_CLEAR_NODE(&rfb->node);
+	INIT_LIST_HEAD(&rfb->list);
+
+	return rfb;
+}
+
+static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, u64 offset)
+{
+	struct rb_node *link = root->rb_node;
+
+	while (link) {
+		struct qcow_refcount_block *t;
+
+		t = rb_entry(link, struct qcow_refcount_block, node);
+		if (!t)
+			goto out;
+
+		if (t->offset > offset)
+			link = link->rb_left;
+		else if (t->offset < offset)
+			link = link->rb_right;
+		else
+			return t;
+	}
+out:
+	return NULL;
+}
+
+static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 offset)
+{
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+
+	rfb = refcount_block_lookup(&rft->root, offset);
+	if (!rfb)
+		return NULL;
+
+	/* Update the LRU state, by moving the searched node to list tail */
+	list_move_tail(&rfb->list, &rft->lru_list);
+
+	return rfb;
+}
+
+static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64 clust_idx)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+	u64 rfb_offset;
+	u64 rft_idx;
+
+	rft_idx = clust_idx >> (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT);
+	if (rft_idx >= rft->rf_size)
+		return NULL;
+
+	rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
+
+	rfb = refcount_block_search(q, rfb_offset);
+	if (rfb)
+		return rfb;
+
+	rfb = new_refcount_block(q, rfb_offset);
+	if (!rfb)
+		return NULL;
+
+	if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb_offset) < 0)
+		goto error_free_rfb;
+
+	if (cache_refcount_block(q, rfb) < 0)
+		goto error_free_rfb;
+
+	return rfb;
+
+error_free_rfb:
+	free(rfb);
+
+	return NULL;
+}
+
 /*
  * QCOW file might grow during a write operation. Not only data but metadata is
  * also written at the end of the file. Therefore it is necessary to ensure
@@ -398,6 +581,7 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
 	struct qcow_l1_table *l1t = &q->table;
 	struct qcow_l2_table *l2t;
 	u64 clust_start;
+	u64 clust_flags;
 	u64 l2t_offset;
 	u64 clust_off;
 	u64 l2t_size;
@@ -435,7 +619,7 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
 		goto error;
 	}
 	if (!(l2t_offset & QCOW_OFLAG_COPIED)) {
-		pr_warning("copy-on-write clusters are not supported");
+		pr_warning("L2 copy-on-write clusters are not supported");
 		goto error;
 	}
 
@@ -476,23 +660,54 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
 	if (!f_sz)
 		goto error;
 
-	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
-	if (clust_start & QCOW_OFLAG_COMPRESSED) {
+	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+
+	clust_flags = clust_start & QCOW_OFLAGS_MASK;
+	if (clust_flags & QCOW_OFLAG_COMPRESSED) {
 		pr_warning("compressed clusters are not supported");
 		goto error;
 	}
-	if (!(clust_start & QCOW_OFLAG_COPIED)) {
-		pr_warning("copy-on-write clusters are not supported");
-		goto error;
-	}
 
 	clust_start &= QCOW_OFFSET_MASK;
 	if (!clust_start) {
 		clust_start = ALIGN(f_sz, clust_sz);
-		l2t->table[l2t_idx] = cpu_to_be64(clust_start);
+		l2t->table[l2t_idx] = cpu_to_be64(clust_start | QCOW_OFLAG_COPIED);
 		l2t->dirty = 1;
 	}
 
+	if (!(clust_flags & QCOW_OFLAG_COPIED)) {
+		struct qcow_refcount_block *rfb = NULL;
+		u16 clust_refcount;
+		u64 clust_idx;
+		u64 rfb_idx;
+
+		clust_idx = (clust_start & QCOW_OFFSET_MASK) >> (header->cluster_bits);
+
+		rfb = qcow_read_refcount_block(q, clust_idx);
+		if (!rfb) {
+			pr_warning("L1: error while reading refcount table");
+			goto error;
+		}
+
+		rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+		if (rfb_idx >= rfb->size) {
+			pr_warning("L1: refcount block index out of bounds");
+			goto error;
+		}
+
+		clust_refcount = be16_to_cpu(rfb->entries[rfb_idx]);
+		if (!clust_refcount) {
+			clust_refcount = 1;
+			rfb->entries[rfb_idx] = cpu_to_be16(clust_refcount);
+			rfb->dirty = 1;
+		}
+
+		if (clust_refcount > 1) {
+			pr_warning("L1 copy-on-write clusters are not supported");
+			goto error;
+		}
+	}
+
 	mutex_unlock(&q->mutex);
 
 	/* Write actual data */
@@ -547,15 +762,24 @@ static ssize_t qcow_nowrite_sector(struct disk_image *disk, u64 sector, void *sr
 static int qcow_disk_flush(struct disk_image *disk)
 {
 	struct qcow *q = disk->priv;
+	struct qcow_refcount_table *rft;
 	struct qcow_header *header;
 	struct list_head *pos, *n;
 	struct qcow_l1_table *l1t;
 
 	header = q->header;
 	l1t = &q->table;
+	rft = &q->refcount_table;
 
 	mutex_lock(&q->mutex);
 
+	list_for_each_safe(pos, n, &rft->lru_list) {
+		struct qcow_refcount_block *c = list_entry(pos, struct qcow_refcount_block, list);
+
+		if (write_refcount_block(q, c) < 0)
+			goto error_unlock;
+	}
+
 	list_for_each_safe(pos, n, &l1t->lru_list) {
 		struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list);
 
@@ -587,7 +811,9 @@ static int qcow_disk_close(struct disk_image *disk)
 
 	q = disk->priv;
 
+	refcount_table_free_cache(&q->refcount_table);
 	l1_table_free_cache(&q->table);
+	free(q->refcount_table.rf_table);
 	free(q->table.l1_table);
 	free(q->header);
 	free(q);
@@ -608,6 +834,26 @@ static struct disk_image_operations qcow_disk_ops = {
 	.close		= qcow_disk_close,
 };
 
+static int qcow_read_refcount_table(struct qcow *q)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	u64 cluster_size;
+
+	cluster_size = 1 << header->cluster_bits;
+
+	rft->rf_size = (header->refcount_table_size * cluster_size) / sizeof(u64);
+
+	rft->rf_table = calloc(rft->rf_size, sizeof(u64));
+	if (!rft->rf_table)
+		return -1;
+
+	rft->root = RB_ROOT;
+	INIT_LIST_HEAD(&rft->lru_list);
+
+	return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, header->refcount_table_offset);
+}
+
 static int qcow_read_l1_table(struct qcow *q)
 {
 	struct qcow_header *header = q->header;
@@ -656,6 +902,8 @@ static void *qcow2_read_header(int fd)
 		.l1_size	= f_header.l1_size,
 		.cluster_bits	= f_header.cluster_bits,
 		.l2_bits	= f_header.cluster_bits - 3,
+		.refcount_table_offset	= f_header.refcount_table_offset,
+		.refcount_table_size	= f_header.refcount_table_clusters,
 	};
 
 	return header;
@@ -687,6 +935,9 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
 	if (qcow_read_l1_table(q) < 0)
 		goto error;
 
+	if (qcow_read_refcount_table(q) < 0)
+		goto error;
+
 	/*
 	 * Do not use mmap use read/write instead
 	 */
diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
index 4ddc2b2..46db702 100644
--- a/tools/kvm/include/kvm/qcow.h
+++ b/tools/kvm/include/kvm/qcow.h
@@ -40,10 +40,32 @@ struct qcow_l1_table {
 	int			nr_cached;
 };
 
+#define QCOW_REFCOUNT_BLOCK_SHIFT	1
+
+struct qcow_refcount_block {
+	u64			offset;
+	struct rb_node		node;
+	struct list_head	list;
+	u64			size;
+	u8			dirty;
+	u16			entries[];
+};
+
+struct qcow_refcount_table {
+	u32			rf_size;
+	u64			*rf_table;
+
+	/* Refcount block caching data structures */
+	struct rb_root		root;
+	struct list_head	lru_list;
+	int			nr_cached;
+};
+
 struct qcow {
 	pthread_mutex_t		mutex;
 	void			*header;
 	struct qcow_l1_table	table;
+	struct qcow_refcount_table	refcount_table;
 	int			fd;
 };
 
@@ -53,6 +75,8 @@ struct qcow_header {
 	u32			l1_size;
 	u8			cluster_bits;
 	u8			l2_bits;
+	u64			refcount_table_offset;
+	u32			refcount_table_size;
 };
 
 struct qcow1_header_disk {
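The write path above boils down to a small state machine for a cluster whose
L2 entry lacks QCOW_OFLAG_COPIED: refcount 0 means the cluster may be claimed
(the entry is bumped to 1 and the refcount block marked dirty, to be written
out on flush or LRU eviction), refcount 1 means the write may proceed in
place, and anything above 1 would require copy-on-write, which remains
unsupported. A toy self-contained model of just that logic; the toy_* names
are invented for this sketch and are not the patch's API:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Toy stand-in for one cached refcount block. */
struct toy_refcount_block {
	uint16_t	entries[8];
	int		dirty;
};

static void toy_write_cluster(struct toy_refcount_block *rfb, unsigned int idx)
{
	uint16_t refcount = rfb->entries[idx];

	if (!refcount) {
		/* First write to a zero refcount cluster: claim it. */
		rfb->entries[idx] = 1;
		rfb->dirty = 1;
	} else if (refcount > 1) {
		/* Shared cluster: would need copy-on-write. */
		fprintf(stderr, "copy-on-write clusters are not supported\n");
	}
	/* refcount == 1: write in place, nothing to update. */
}

static void toy_flush(struct toy_refcount_block *rfb)
{
	if (!rfb->dirty)
		return;		/* clean blocks are skipped on flush */
	/* The real code pwrite()s the block back to the image here. */
	rfb->dirty = 0;
}

int main(void)
{
	struct toy_refcount_block rfb;

	memset(&rfb, 0, sizeof(rfb));

	toy_write_cluster(&rfb, 3);	/* refcount 0 -> 1, block dirtied */
	printf("entry 3 = %u, dirty = %d\n", rfb.entries[3], rfb.dirty);

	toy_flush(&rfb);		/* VIRTIO_BLK_T_FLUSH analogue */
	printf("after flush: dirty = %d\n", rfb.dirty);

	return 0;
}

This is why 'qemu-img check' stops complaining: the claimed cluster's on-disk
refcount now matches its single L2 reference.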
-- 
1.7.0.4