[RFC/PATCH] kvm tools, qcow: Add support for writing to zero refcount clusters

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch adds support for writing to zero refcount clusters. Refcount blocks
are cached like L2 tables and flushed upon VIRTIO_BLK_T_FLUSH and when
evicted from the LRU cache.

With this patch applied, 'qemu-img check' no longer complains about referenced
clusters with zero reference count after

  dd if=/dev/zero of=/mnt/tmp

where '/mnt' is a freshly generated QCOW2 image.

Cc: Asias He <asias.hejun@xxxxxxxxx>
Cc: Cyrill Gorcunov <gorcunov@xxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxx>
Cc: Kevin Wolf <kwolf@xxxxxxxxxx>
Cc: Prasad Joshi <prasadjoshi124@xxxxxxxxx>
Cc: Sasha Levin <levinsasha928@xxxxxxxxx>
Signed-off-by: Pekka Enberg <penberg@xxxxxxxxxx>
---
 tools/kvm/disk/qcow.c        |  267 ++++++++++++++++++++++++++++++++++++++++--
 tools/kvm/include/kvm/qcow.h |   24 ++++
 2 files changed, 283 insertions(+), 8 deletions(-)

diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
index 2ef1ecb..7c48dfb 100644
--- a/tools/kvm/disk/qcow.c
+++ b/tools/kvm/disk/qcow.c
@@ -382,6 +382,189 @@ static u64 qcow_write_l2_table(struct qcow *q, u64 *table)
 	return off;
 }
 
+static void refcount_table_free_cache(struct qcow_refcount_table *rft)
+{
+	struct rb_root *r = &rft->root;
+	struct list_head *pos, *n;
+	struct qcow_refcount_block *t;
+
+	list_for_each_safe(pos, n, &rft->lru_list) {
+		list_del(pos);
+		t = list_entry(pos, struct qcow_refcount_block, list);
+		rb_erase(&t->node, r);
+
+		free(t);
+	}
+}
+
+static int refcount_block_insert(struct rb_root *root, struct qcow_refcount_block *new)
+{
+	struct rb_node **link = &(root->rb_node), *parent = NULL;
+	u64 offset = new->offset;
+
+	/* search the tree */
+	while (*link) {
+		struct qcow_refcount_block *t;
+
+		t = rb_entry(*link, struct qcow_refcount_block, node);
+		if (!t)
+			goto error;
+
+		parent = *link;
+
+		if (t->offset > offset)
+			link = &(*link)->rb_left;
+		else if (t->offset < offset)
+			link = &(*link)->rb_right;
+		else
+			goto out;
+	}
+
+	/* add new node */
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, root);
+out:
+	return 0;
+error:
+	return -1;
+}
+
+static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb)
+{
+	if (!rfb->dirty)
+		return 0;
+
+	if (pwrite_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb->offset) < 0)
+		return -1;
+
+	rfb->dirty = 0;
+
+	return 0;
+}
+
+static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
+{
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct rb_root *r = &rft->root;
+	struct qcow_refcount_block *lru;
+
+	if (rft->nr_cached == MAX_CACHE_NODES) {
+		lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
+
+		if (write_refcount_block(q, lru) < 0)
+			goto error;
+
+		rb_erase(&lru->node, r);
+		list_del_init(&lru->list);
+		rft->nr_cached--;
+
+		free(lru);
+	}
+
+	if (refcount_block_insert(r, c) < 0)
+		goto error;
+
+	list_add_tail(&c->list, &rft->lru_list);
+	rft->nr_cached++;
+
+	return 0;
+error:
+	return -1;
+}
+
+static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 rfb_offset)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_block *rfb;
+	u64 cluster_size;
+
+	cluster_size = 1 << header->cluster_bits;
+
+	rfb = malloc(sizeof *rfb + cluster_size);
+	if (!rfb)
+		return NULL;
+
+	rfb->offset = rfb_offset;
+	rfb->size = cluster_size / sizeof(u16);
+	RB_CLEAR_NODE(&rfb->node);
+	INIT_LIST_HEAD(&rfb->list);
+
+	return rfb;
+}
+
+static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, u64 offset)
+{
+	struct rb_node *link = root->rb_node;
+
+	while (link) {
+		struct qcow_refcount_block *t;
+
+		t = rb_entry(link, struct qcow_refcount_block, node);
+		if (!t)
+			goto out;
+
+		if (t->offset > offset)
+			link = link->rb_left;
+		else if (t->offset < offset)
+			link = link->rb_right;
+		else
+			return t;
+	}
+out:
+	return NULL;
+}
+
+static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 offset)
+{
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+
+	rfb = refcount_block_lookup(&rft->root, offset);
+	if (!rfb)
+		return NULL;
+
+	/* Update the LRU state, by moving the searched node to list tail */
+	list_move_tail(&rfb->list, &rft->lru_list);
+
+	return rfb;
+}
+
+static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64 clust_idx)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+	u64 rfb_offset;
+	u64 rft_idx;
+
+	rft_idx = clust_idx >> (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT);
+	if (rft_idx >= rft->rf_size)
+		return NULL;
+
+	rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
+
+	rfb = refcount_block_search(q, rfb_offset);
+	if (rfb)
+		return rfb;
+
+	rfb = new_refcount_block(q, rfb_offset);
+	if (!rfb)
+		return NULL;
+
+	if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb_offset) < 0)
+		goto error_free_rfb;
+
+	if (cache_refcount_block(q, rfb) < 0)
+		goto error_free_rfb;
+
+	return rfb;
+
+error_free_rfb:
+	free(rfb);
+
+	return NULL;
+}
+
 /*
  * QCOW file might grow during a write operation. Not only data but metadata is
  * also written at the end of the file. Therefore it is necessary to ensure
@@ -398,6 +581,7 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
 	struct qcow_l1_table *l1t = &q->table;
 	struct qcow_l2_table *l2t;
 	u64 clust_start;
+	u64 clust_flags;
 	u64 l2t_offset;
 	u64 clust_off;
 	u64 l2t_size;
@@ -435,7 +619,7 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
 		goto error;
 	}
 	if (!(l2t_offset & QCOW_OFLAG_COPIED)) {
-		pr_warning("copy-on-write clusters are not supported");
+		pr_warning("L2 copy-on-write clusters are not supported");
 		goto error;
 	}
 
@@ -476,23 +660,54 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
 	if (!f_sz)
 		goto error;
 
-	clust_start	= be64_to_cpu(l2t->table[l2t_idx]);
-	if (clust_start & QCOW_OFLAG_COMPRESSED) {
+	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+
+	clust_flags = clust_start & QCOW_OFLAGS_MASK;
+	if (clust_flags & QCOW_OFLAG_COMPRESSED) {
 		pr_warning("compressed clusters are not supported");
 		goto error;
 	}
-	if (!(clust_start & QCOW_OFLAG_COPIED)) {
-		pr_warning("copy-on-write clusters are not supported");
-		goto error;
-	}
 
 	clust_start &= QCOW_OFFSET_MASK;
 	if (!clust_start) {
 		clust_start		= ALIGN(f_sz, clust_sz);
-		l2t->table[l2t_idx]	= cpu_to_be64(clust_start);
+		l2t->table[l2t_idx]	= cpu_to_be64(clust_start | QCOW_OFLAG_COPIED);
 		l2t->dirty		= 1;
 	}
 
+	if (!(clust_flags & QCOW_OFLAG_COPIED)) {
+		struct qcow_refcount_block *rfb = NULL;
+		u16 clust_refcount;
+		u64 clust_idx;
+		u64 rfb_idx;
+
+		clust_idx = (clust_start & QCOW_OFFSET_MASK) >> (header->cluster_bits);
+
+		rfb = qcow_read_refcount_block(q, clust_idx);
+		if (!rfb) {
+			pr_warning("L1: error while reading refcount table");
+			goto error;
+		}
+
+		rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+		if (rfb_idx >= rfb->size) {
+			pr_warning("L1: refcount block index out of bounds");
+			goto error;
+		}
+
+		clust_refcount = be16_to_cpu(rfb->entries[rfb_idx]);
+		if (!clust_refcount) {
+			clust_refcount = 1;
+			rfb->entries[rfb_idx] = cpu_to_be16(clust_refcount);
+			rfb->dirty = 1;
+		}
+
+		if (clust_refcount > 1) {
+			pr_warning("L1 copy-on-write clusters are not supported");
+			goto error;
+		}
+	}
+
 	mutex_unlock(&q->mutex);
 
 	/* Write actual data */
@@ -547,15 +762,24 @@ static ssize_t qcow_nowrite_sector(struct disk_image *disk, u64 sector, void *sr
 static int qcow_disk_flush(struct disk_image *disk)
 {
 	struct qcow *q = disk->priv;
+	struct qcow_refcount_table *rft;
 	struct qcow_header *header;
 	struct list_head *pos, *n;
 	struct qcow_l1_table *l1t;
 
 	header = q->header;
 	l1t = &q->table;
+	rft = &q->refcount_table;
 
 	mutex_lock(&q->mutex);
 
+	list_for_each_safe(pos, n, &rft->lru_list) {
+		struct qcow_refcount_block *c = list_entry(pos, struct qcow_refcount_block, list);
+
+		if (write_refcount_block(q, c) < 0)
+			goto error_unlock;
+	}
+
 	list_for_each_safe(pos, n, &l1t->lru_list) {
 		struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list);
 
@@ -587,7 +811,9 @@ static int qcow_disk_close(struct disk_image *disk)
 
 	q = disk->priv;
 
+	refcount_table_free_cache(&q->refcount_table);
 	l1_table_free_cache(&q->table);
+	free(q->refcount_table.rf_table);
 	free(q->table.l1_table);
 	free(q->header);
 	free(q);
@@ -608,6 +834,26 @@ static struct disk_image_operations qcow_disk_ops = {
 	.close			= qcow_disk_close,
 };
 
+static int qcow_read_refcount_table(struct qcow *q)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	u64 cluster_size;
+
+	cluster_size = 1 << header->cluster_bits;
+
+	rft->rf_size = (header->refcount_table_size * cluster_size) / sizeof(u64);
+
+	rft->rf_table = calloc(rft->rf_size, sizeof(u64));
+	if (!rft->rf_table)
+		return -1;
+
+	rft->root = RB_ROOT;
+	INIT_LIST_HEAD(&rft->lru_list);
+
+	return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, header->refcount_table_offset);
+}
+
 static int qcow_read_l1_table(struct qcow *q)
 {
 	struct qcow_header *header = q->header;
@@ -656,6 +902,8 @@ static void *qcow2_read_header(int fd)
 		.l1_size		= f_header.l1_size,
 		.cluster_bits		= f_header.cluster_bits,
 		.l2_bits		= f_header.cluster_bits - 3,
+		.refcount_table_offset	= f_header.refcount_table_offset,
+		.refcount_table_size	= f_header.refcount_table_clusters,
 	};
 
 	return header;
@@ -687,6 +935,9 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
 	if (qcow_read_l1_table(q) < 0)
 		goto error;
 
+	if (qcow_read_refcount_table(q) < 0)
+		goto error;
+
 	/*
 	 * Do not use mmap use read/write instead
 	 */
diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
index 4ddc2b2..46db702 100644
--- a/tools/kvm/include/kvm/qcow.h
+++ b/tools/kvm/include/kvm/qcow.h
@@ -40,10 +40,32 @@ struct qcow_l1_table {
 	int				nr_cached;
 };
 
+#define QCOW_REFCOUNT_BLOCK_SHIFT	1
+
+struct qcow_refcount_block {
+	u64				offset;
+	struct rb_node			node;
+	struct list_head		list;
+	u64				size;
+	u8				dirty;
+	u16				entries[];
+};
+
+struct qcow_refcount_table {
+	u32				rf_size;
+	u64				*rf_table;
+
+	/* Refcount block caching data structures */
+	struct rb_root			root;
+	struct list_head		lru_list;
+	int				nr_cached;
+};
+
 struct qcow {
 	pthread_mutex_t			mutex;
 	void				*header;
 	struct qcow_l1_table		table;
+	struct qcow_refcount_table	refcount_table;
 	int				fd;
 };
 
@@ -53,6 +75,8 @@ struct qcow_header {
 	u32				l1_size;
 	u8				cluster_bits;
 	u8				l2_bits;
+	u64				refcount_table_offset;
+	u32				refcount_table_size;
 };
 
 struct qcow1_header_disk {
-- 
1.7.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux