Hi,

As you can see from the subject, I'm getting qcow2 corruption.

I have a Windows 2000 guest which boots and runs fine in kvm-72, fails
with a blue screen indicating file corruption errors in kvm-73 through
to kvm-83 (the latest), and succeeds again if I replace block-qcow2.c
with the version from kvm-72.

The blue screen appears towards the end of the boot sequence, and shows
only briefly before the guest reboots. It says:

    STOP: c0000218 (Registry File Failure)
    The registry cannot load the hive (file):
    \SystemRoot\System32\Config\SOFTWARE
    or its log or alternate.
    It is corrupt, absent, or not writable.
    Beginning dump of physical memory
    Physical memory dump complete.
    Contact your system administrator or technical support [...?]

Although there are many ways to make Windows blue screen in KVM, in
this case I've narrowed it down to the difference in qemu/block-qcow2.c
between kvm-72 and kvm-73 (not -83). If I build kvm-83, except with
block-qcow2.c reverted back to the kvm-72 version, my guest boots and
runs. Since the changes to block-qcow2.c only affect how it reads and
writes the qcow2 format, not device emulation, that suggests a bug in
the new version.

From kvm-73 to kvm-83 there have been more changes to block-qcow2.c,
but I still see the same symptom.

The bug isn't in reading, so it must be caused by writing. When I use
"qemu-img convert" to convert my qcow2 file to a raw file, the broken
and fixed versions produce the same raw file. Also, when I use
"-snapshot", kvm-83 works without the patch! So there is no evident
corruption as long as the qcow2 file is only read.
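In other words, the read-path check amounts to something like this (a
sketch only; the image and output names are placeholders, not my actual
files):

    # one qemu-img built from plain kvm-83, one from kvm-83 with
    # block-qcow2.c reverted to the kvm-72 version
    qemu-img convert -O raw guest.qcow2 from-kvm83.raw
    qemu-img convert -O raw guest.qcow2 from-reverted.raw
    cmp from-kvm83.raw from-reverted.raw    # reports no difference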
Applying the patch below to kvm-83 fixes it.

You probably don't want to apply this patch as-is, since it reverts
some good-looking qcow2 optimisations, but take it as a big clue that
those optimisations introduced a bug which shows up as data corruption
in the guest once some writing is done. I don't know more than that.

If an official QEMU release is about to happen, this should probably
be fixed before it's unleashed on existing guests :-)

Regards,

-- Jamie

--- kvm-83-release/qemu/block-qcow2.c	2009-01-13 13:29:42.000000000 +0000
+++ kvm-83-fixed/qemu/block-qcow2.c	2009-02-11 05:37:53.000000000 +0000
@@ -52,8 +52,6 @@
 #define QCOW_CRYPT_NONE 0
 #define QCOW_CRYPT_AES  1
 
-#define QCOW_MAX_CRYPT_CLUSTERS 32
-
 /* indicate that the refcount of the referenced cluster is exactly one. */
 #define QCOW_OFLAG_COPIED     (1LL << 63)
 /* indicate that the cluster is compressed (they never have the copied flag) */
@@ -61,6 +59,10 @@
 
 #define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
 
+#ifndef offsetof
+#define offsetof(type, field) ((size_t) &((type *)0)->field)
+#endif
+
 typedef struct QCowHeader {
     uint32_t magic;
     uint32_t version;
@@ -269,8 +271,7 @@
     if (!s->cluster_cache)
         goto fail;
     /* one more sector for decompressed data alignment */
-    s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
-                                  + 512);
+    s->cluster_data = qemu_malloc(s->cluster_size + 512);
     if (!s->cluster_data)
         goto fail;
     s->cluster_cache_offset = -1;
@@ -437,7 +438,8 @@
     int new_l1_size, new_l1_size2, ret, i;
     uint64_t *new_l1_table;
     uint64_t new_l1_table_offset;
-    uint8_t data[12];
+    uint64_t data64;
+    uint32_t data32;
 
     new_l1_size = s->l1_size;
     if (min_size <= new_l1_size)
@@ -467,10 +469,13 @@
         new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
 
     /* set new table */
-    cpu_to_be32w((uint32_t*)data, new_l1_size);
-    cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset);
-    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size), data,
-                sizeof(data)) != sizeof(data))
+    data64 = cpu_to_be64(new_l1_table_offset);
+    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_table_offset),
+                    &data64, sizeof(data64)) != sizeof(data64))
+        goto fail;
+    data32 = cpu_to_be32(new_l1_size);
+    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size),
+                    &data32, sizeof(data32)) != sizeof(data32))
         goto fail;
     qemu_free(s->l1_table);
     free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
@@ -483,549 +488,169 @@
     return -EIO;
 }
 
-/*
- * seek_l2_table
- *
- * seek l2_offset in the l2_cache table
- * if not found, return NULL,
- * if found,
- *   increments the l2 cache hit count of the entry,
- *   if counter overflow, divide by two all counters
- *   return the pointer to the l2 cache entry
- *
- */
-
-static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
-{
-    int i, j;
-
-    for(i = 0; i < L2_CACHE_SIZE; i++) {
-        if (l2_offset == s->l2_cache_offsets[i]) {
-            /* increment the hit count */
-            if (++s->l2_cache_counts[i] == 0xffffffff) {
-                for(j = 0; j < L2_CACHE_SIZE; j++) {
-                    s->l2_cache_counts[j] >>= 1;
-                }
-            }
-            return s->l2_cache + (i << s->l2_bits);
-        }
-    }
-    return NULL;
-}
-
-/*
- * l2_load
- *
- * Loads a L2 table into memory. If the table is in the cache, the cache
- * is used; otherwise the L2 table is loaded from the image file.
- *
- * Returns a pointer to the L2 table on success, or NULL if the read from
- * the image file failed.
- */
-
-static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset)
-{
-    BDRVQcowState *s = bs->opaque;
-    int min_index;
-    uint64_t *l2_table;
-
-    /* seek if the table for the given offset is in the cache */
-
-    l2_table = seek_l2_table(s, l2_offset);
-    if (l2_table != NULL)
-        return l2_table;
-
-    /* not found: load a new entry in the least used one */
-
-    min_index = l2_cache_new_entry(bs);
-    l2_table = s->l2_cache + (min_index << s->l2_bits);
-    if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
-        s->l2_size * sizeof(uint64_t))
-        return NULL;
-    s->l2_cache_offsets[min_index] = l2_offset;
-    s->l2_cache_counts[min_index] = 1;
-
-    return l2_table;
-}
-
-/*
- * l2_allocate
- *
- * Allocate a new l2 entry in the file. If l1_index points to an already
- * used entry in the L2 table (i.e. we are doing a copy on write for the L2
- * table) copy the contents of the old L2 table into the newly allocated one.
- * Otherwise the new table is initialized with zeros.
- *
- */
-
-static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index)
-{
-    BDRVQcowState *s = bs->opaque;
-    int min_index;
-    uint64_t old_l2_offset, tmp;
-    uint64_t *l2_table, l2_offset;
-
-    old_l2_offset = s->l1_table[l1_index];
-
-    /* allocate a new l2 entry */
-
-    l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
-
-    /* update the L1 entry */
-
-    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
-
-    tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
-    if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
-                    &tmp, sizeof(tmp)) != sizeof(tmp))
-        return NULL;
-
-    /* allocate a new entry in the l2 cache */
-
-    min_index = l2_cache_new_entry(bs);
-    l2_table = s->l2_cache + (min_index << s->l2_bits);
-
-    if (old_l2_offset == 0) {
-        /* if there was no old l2 table, clear the new table */
-        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
-    } else {
-        /* if there was an old l2 table, read it from the disk */
-        if (bdrv_pread(s->hd, old_l2_offset,
-                       l2_table, s->l2_size * sizeof(uint64_t)) !=
-            s->l2_size * sizeof(uint64_t))
-            return NULL;
-    }
-    /* write the l2 table to the file */
-    if (bdrv_pwrite(s->hd, l2_offset,
-                    l2_table, s->l2_size * sizeof(uint64_t)) !=
-        s->l2_size * sizeof(uint64_t))
-        return NULL;
-
-    /* update the l2 cache entry */
-
-    s->l2_cache_offsets[min_index] = l2_offset;
-    s->l2_cache_counts[min_index] = 1;
-
-    return l2_table;
-}
-
-static int size_to_clusters(BDRVQcowState *s, int64_t size)
-{
-    return (size + (s->cluster_size - 1)) >> s->cluster_bits;
-}
-
-static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
-        uint64_t *l2_table, uint64_t start, uint64_t mask)
-{
-    int i;
-    uint64_t offset = be64_to_cpu(l2_table[0]) & ~mask;
-
-    if (!offset)
-        return 0;
-
-    for (i = start; i < start + nb_clusters; i++)
-        if (offset + i * cluster_size != (be64_to_cpu(l2_table[i]) & ~mask))
-            break;
-
-    return (i - start);
-}
-
-static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
-{
-    int i = 0;
-
-    while(nb_clusters-- && l2_table[i] == 0)
-        i++;
-
-    return i;
-}
 
-/*
- * get_cluster_offset
+/* 'allocate' is:
  *
- * For a given offset of the disk image, return cluster offset in
- * qcow2 file.
+ * 0 not to allocate.
  *
- * on entry, *num is the number of contiguous clusters we'd like to
- * access following offset.
+ * 1 to allocate a normal cluster (for sector indexes 'n_start' to
+ * 'n_end')
  *
- * on exit, *num is the number of contiguous clusters we can read.
- *
- * Return 1, if the offset is found
- * Return 0, otherwise.
+ * 2 to allocate a compressed cluster of size
+ * 'compressed_size'. 'compressed_size' must be > 0 and <
+ * cluster_size
  *
+ * return 0 if not allocated.
  */
-
 static uint64_t get_cluster_offset(BlockDriverState *bs,
-                                   uint64_t offset, int *num)
+                                   uint64_t offset, int allocate,
+                                   int compressed_size,
+                                   int n_start, int n_end)
 {
     BDRVQcowState *s = bs->opaque;
-    int l1_index, l2_index;
-    uint64_t l2_offset, *l2_table, cluster_offset;
-    int l1_bits, c;
-    int index_in_cluster, nb_available, nb_needed, nb_clusters;
-
-    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
-    nb_needed = *num + index_in_cluster;
-
-    l1_bits = s->l2_bits + s->cluster_bits;
-
-    /* compute how many bytes there are between the offset and
-     * the end of the l1 entry
-     */
-
-    nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1));
-
-    /* compute the number of available sectors */
-
-    nb_available = (nb_available >> 9) + index_in_cluster;
-
-    cluster_offset = 0;
-
-    /* seek the the l2 offset in the l1 table */
-
-    l1_index = offset >> l1_bits;
-    if (l1_index >= s->l1_size)
-        goto out;
-
-    l2_offset = s->l1_table[l1_index];
-
-    /* seek the l2 table of the given l2 offset */
-
-    if (!l2_offset)
-        goto out;
-
-    /* load the l2 table in memory */
-
-    l2_offset &= ~QCOW_OFLAG_COPIED;
-    l2_table = l2_load(bs, l2_offset);
-    if (l2_table == NULL)
-        return 0;
-
-    /* find the cluster offset for the given disk offset */
-
-    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
-    cluster_offset = be64_to_cpu(l2_table[l2_index]);
-    nb_clusters = size_to_clusters(s, nb_needed << 9);
-
-    if (!cluster_offset) {
-        /* how many empty clusters ? */
-        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
-    } else {
-        /* how many allocated clusters ? */
-        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
-                &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
-    }
-
-    nb_available = (c * s->cluster_sectors);
-out:
-    if (nb_available > nb_needed)
-        nb_available = nb_needed;
-
-    *num = nb_available - index_in_cluster;
-
-    return cluster_offset & ~QCOW_OFLAG_COPIED;
-}
-
-/*
- * free_any_clusters
- *
- * free clusters according to its type: compressed or not
- *
- */
-
-static void free_any_clusters(BlockDriverState *bs,
-                              uint64_t cluster_offset, int nb_clusters)
-{
-    BDRVQcowState *s = bs->opaque;
-
-    /* free the cluster */
-
-    if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
-        int nb_csectors;
-        nb_csectors = ((cluster_offset >> s->csize_shift) &
-                       s->csize_mask) + 1;
-        free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
-                      nb_csectors * 512);
-        return;
-    }
-
-    free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
-
-    return;
-}
-
-/*
- * get_cluster_table
- *
- * for a given disk offset, load (and allocate if needed)
- * the l2 table.
- *
- * the l2 table offset in the qcow2 file and the cluster index
- * in the l2 table are given to the caller.
- *
- */
-
-static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
-                             uint64_t **new_l2_table,
-                             uint64_t *new_l2_offset,
-                             int *new_l2_index)
-{
-    BDRVQcowState *s = bs->opaque;
-    int l1_index, l2_index, ret;
-    uint64_t l2_offset, *l2_table;
-
-    /* seek the the l2 offset in the l1 table */
+    int min_index, i, j, l1_index, l2_index, ret;
+    uint64_t l2_offset, *l2_table, cluster_offset, tmp, old_l2_offset;
 
     l1_index = offset >> (s->l2_bits + s->cluster_bits);
     if (l1_index >= s->l1_size) {
-        ret = grow_l1_table(bs, l1_index + 1);
-        if (ret < 0)
+        /* outside l1 table is allowed: we grow the table if needed */
+        if (!allocate)
+            return 0;
+        if (grow_l1_table(bs, l1_index + 1) < 0)
             return 0;
     }
     l2_offset = s->l1_table[l1_index];
+    if (!l2_offset) {
+        if (!allocate)
+            return 0;
+ l2_allocate:
+        old_l2_offset = l2_offset;
+        /* allocate a new l2 entry */
+        l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
+        /* update the L1 entry */
+        s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
+        tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
+        if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
+                        &tmp, sizeof(tmp)) != sizeof(tmp))
+            return 0;
+        min_index = l2_cache_new_entry(bs);
+        l2_table = s->l2_cache + (min_index << s->l2_bits);
 
-    /* seek the l2 table of the given l2 offset */
-
-    if (l2_offset & QCOW_OFLAG_COPIED) {
-        /* load the l2 table in memory */
-        l2_offset &= ~QCOW_OFLAG_COPIED;
-        l2_table = l2_load(bs, l2_offset);
-        if (l2_table == NULL)
+        if (old_l2_offset == 0) {
+            memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+        } else {
+            if (bdrv_pread(s->hd, old_l2_offset,
+                           l2_table, s->l2_size * sizeof(uint64_t)) !=
+                s->l2_size * sizeof(uint64_t))
+                return 0;
+        }
+        if (bdrv_pwrite(s->hd, l2_offset,
+                        l2_table, s->l2_size * sizeof(uint64_t)) !=
+            s->l2_size * sizeof(uint64_t))
            return 0;
    } else {
-        if (l2_offset)
-            free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
-        l2_table = l2_allocate(bs, l1_index);
-        if (l2_table == NULL)
+        if (!(l2_offset & QCOW_OFLAG_COPIED)) {
+            if (allocate) {
+                free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
+                goto l2_allocate;
+            }
+        } else {
+            l2_offset &= ~QCOW_OFLAG_COPIED;
+        }
+        for(i = 0; i < L2_CACHE_SIZE; i++) {
+            if (l2_offset == s->l2_cache_offsets[i]) {
+                /* increment the hit count */
+                if (++s->l2_cache_counts[i] == 0xffffffff) {
+                    for(j = 0; j < L2_CACHE_SIZE; j++) {
+                        s->l2_cache_counts[j] >>= 1;
+                    }
+                }
+                l2_table = s->l2_cache + (i << s->l2_bits);
+                goto found;
+            }
+        }
+        /* not found: load a new entry in the least used one */
+        min_index = l2_cache_new_entry(bs);
+        l2_table = s->l2_cache + (min_index << s->l2_bits);
+        if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
+            s->l2_size * sizeof(uint64_t))
            return 0;
-        l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
    }
-
-    /* find the cluster offset for the given disk offset */
-
+    s->l2_cache_offsets[min_index] = l2_offset;
+    s->l2_cache_counts[min_index] = 1;
+ found:
     l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
-
-    *new_l2_table = l2_table;
-    *new_l2_offset = l2_offset;
-    *new_l2_index = l2_index;
-
-    return 1;
-}
-
-/*
- * alloc_compressed_cluster_offset
- *
- * For a given offset of the disk image, return cluster offset in
- * qcow2 file.
- *
- * If the offset is not found, allocate a new compressed cluster.
- *
- * Return the cluster offset if successful,
- * Return 0, otherwise.
- *
- */
-
-static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
-                                                uint64_t offset,
-                                                int compressed_size)
-{
-    BDRVQcowState *s = bs->opaque;
-    int l2_index, ret;
-    uint64_t l2_offset, *l2_table, cluster_offset;
-    int nb_csectors;
-
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
-    if (ret == 0)
-        return 0;
-
     cluster_offset = be64_to_cpu(l2_table[l2_index]);
-    if (cluster_offset & QCOW_OFLAG_COPIED)
-        return cluster_offset & ~QCOW_OFLAG_COPIED;
-
-    if (cluster_offset)
-        free_any_clusters(bs, cluster_offset, 1);
-
-    cluster_offset = alloc_bytes(bs, compressed_size);
-    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
-                  (cluster_offset >> 9);
-
-    cluster_offset |= QCOW_OFLAG_COMPRESSED |
-                      ((uint64_t)nb_csectors << s->csize_shift);
-
-    /* update L2 table */
-
-    /* compressed clusters never have the copied flag */
-
-    l2_table[l2_index] = cpu_to_be64(cluster_offset);
-    if (bdrv_pwrite(s->hd,
-                    l2_offset + l2_index * sizeof(uint64_t),
-                    l2_table + l2_index,
-                    sizeof(uint64_t)) != sizeof(uint64_t))
-        return 0;
-
-    return cluster_offset;
-}
-
-typedef struct QCowL2Meta
-{
-    uint64_t offset;
-    int n_start;
-    int nb_available;
-    int nb_clusters;
-} QCowL2Meta;
-
-static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
-        QCowL2Meta *m)
-{
-    BDRVQcowState *s = bs->opaque;
-    int i, j = 0, l2_index, ret;
-    uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
-
-    if (m->nb_clusters == 0)
-        return 0;
-
-    if (!(old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t))))
-        return -ENOMEM;
-
-    /* copy content of unmodified sectors */
-    start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
-    if (m->n_start) {
-        ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
-        if (ret < 0)
-            goto err;
+    if (!cluster_offset) {
+        if (!allocate)
+            return cluster_offset;
+    } else if (!(cluster_offset & QCOW_OFLAG_COPIED)) {
+        if (!allocate)
+            return cluster_offset;
+        /* free the cluster */
+        if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+            int nb_csectors;
+            nb_csectors = ((cluster_offset >> s->csize_shift) &
+                           s->csize_mask) + 1;
+            free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
+                          nb_csectors * 512);
+        } else {
+            free_clusters(bs, cluster_offset, s->cluster_size);
+        }
+    } else {
+        cluster_offset &= ~QCOW_OFLAG_COPIED;
+        return cluster_offset;
     }
-
-    if (m->nb_available & (s->cluster_sectors - 1)) {
-        uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
-        ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
-                m->nb_available - end, s->cluster_sectors);
-        if (ret < 0)
-            goto err;
+    if (allocate == 1) {
+        /* allocate a new cluster */
+        cluster_offset = alloc_clusters(bs, s->cluster_size);
+
+        /* we must initialize the cluster content which won't be
+           written */
+        if ((n_end - n_start) < s->cluster_sectors) {
+            uint64_t start_sect;
+
+            start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
+            ret = copy_sectors(bs, start_sect,
+                               cluster_offset, 0, n_start);
+            if (ret < 0)
+                return 0;
+            ret = copy_sectors(bs, start_sect,
+                               cluster_offset, n_end, s->cluster_sectors);
+            if (ret < 0)
+                return 0;
+        }
+        tmp = cpu_to_be64(cluster_offset | QCOW_OFLAG_COPIED);
+    } else {
+        int nb_csectors;
+        cluster_offset = alloc_bytes(bs, compressed_size);
+        nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
+                      (cluster_offset >> 9);
+        cluster_offset |= QCOW_OFLAG_COMPRESSED |
+                          ((uint64_t)nb_csectors << s->csize_shift);
+        /* compressed clusters never have the copied flag */
+        tmp = cpu_to_be64(cluster_offset);
     }
-
-    ret = -EIO;
     /* update L2 table */
-    if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index))
-        goto err;
-
-    for (i = 0; i < m->nb_clusters; i++) {
-        if(l2_table[l2_index + i] != 0)
-            old_cluster[j++] = l2_table[l2_index + i];
-
-        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
-                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
-     }
-
-    if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t),
-                l2_table + l2_index, m->nb_clusters * sizeof(uint64_t)) !=
-            m->nb_clusters * sizeof(uint64_t))
-        goto err;
-
-    for (i = 0; i < j; i++)
-        free_any_clusters(bs, old_cluster[i], 1);
-
-    ret = 0;
-err:
-    qemu_free(old_cluster);
-    return ret;
- }
-
-/*
- * alloc_cluster_offset
- *
- * For a given offset of the disk image, return cluster offset in
- * qcow2 file.
- *
- * If the offset is not found, allocate a new cluster.
- *
- * Return the cluster offset if successful,
- * Return 0, otherwise.
- *
- */
-
-static uint64_t alloc_cluster_offset(BlockDriverState *bs,
-                                     uint64_t offset,
-                                     int n_start, int n_end,
-                                     int *num, QCowL2Meta *m)
-{
-    BDRVQcowState *s = bs->opaque;
-    int l2_index, ret;
-    uint64_t l2_offset, *l2_table, cluster_offset;
-    int nb_clusters, i = 0;
-
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
-    if (ret == 0)
+    l2_table[l2_index] = tmp;
+    if (bdrv_pwrite(s->hd,
+                    l2_offset + l2_index * sizeof(tmp), &tmp, sizeof(tmp)) != sizeof(tmp))
         return 0;
-
-    nb_clusters = size_to_clusters(s, n_end << 9);
-
-    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
-
-    cluster_offset = be64_to_cpu(l2_table[l2_index]);
-
-    /* We keep all QCOW_OFLAG_COPIED clusters */
-
-    if (cluster_offset & QCOW_OFLAG_COPIED) {
-        nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
-                &l2_table[l2_index], 0, 0);
-
-        cluster_offset &= ~QCOW_OFLAG_COPIED;
-        m->nb_clusters = 0;
-
-        goto out;
-    }
-
-    /* for the moment, multiple compressed clusters are not managed */
-
-    if (cluster_offset & QCOW_OFLAG_COMPRESSED)
-        nb_clusters = 1;
-
-    /* how many available clusters ? */
-
-    while (i < nb_clusters) {
-        i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
-                &l2_table[l2_index], i, 0);
-
-        if(be64_to_cpu(l2_table[l2_index + i]))
-            break;
-
-        i += count_contiguous_free_clusters(nb_clusters - i,
-                &l2_table[l2_index + i]);
-
-        cluster_offset = be64_to_cpu(l2_table[l2_index + i]);
-
-        if ((cluster_offset & QCOW_OFLAG_COPIED) ||
-                (cluster_offset & QCOW_OFLAG_COMPRESSED))
-            break;
-    }
-    nb_clusters = i;
-
-    /* allocate a new cluster */
-
-    cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
-
-    /* save info needed for meta data update */
-    m->offset = offset;
-    m->n_start = n_start;
-    m->nb_clusters = nb_clusters;
-
-out:
-    m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
-
-    *num = m->nb_available - n_start;
-
     return cluster_offset;
 }
 
 static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors, int *pnum)
 {
+    BDRVQcowState *s = bs->opaque;
+    int index_in_cluster, n;
     uint64_t cluster_offset;
 
-    *pnum = nb_sectors;
-    cluster_offset = get_cluster_offset(bs, sector_num << 9, pnum);
-
+    cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+    index_in_cluster = sector_num & (s->cluster_sectors - 1);
+    n = s->cluster_sectors - index_in_cluster;
+    if (n > nb_sectors)
+        n = nb_sectors;
+    *pnum = n;
     return (cluster_offset != 0);
 }
@@ -1102,9 +727,11 @@
     uint64_t cluster_offset;
 
     while (nb_sectors > 0) {
-        n = nb_sectors;
-        cluster_offset = get_cluster_offset(bs, sector_num << 9, &n);
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
         index_in_cluster = sector_num & (s->cluster_sectors - 1);
+        n = s->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors)
+            n = nb_sectors;
         if (!cluster_offset) {
             if (bs->backing_hd) {
                 /* read from the base image */
@@ -1143,18 +770,15 @@
     BDRVQcowState *s = bs->opaque;
     int ret, index_in_cluster, n;
     uint64_t cluster_offset;
-    int n_end;
-    QCowL2Meta l2meta;
 
     while (nb_sectors > 0) {
         index_in_cluster = sector_num & (s->cluster_sectors - 1);
-        n_end = index_in_cluster + nb_sectors;
-        if (s->crypt_method &&
-            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
-            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
-        cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
-                                              index_in_cluster,
-                                              n_end, &n, &l2meta);
+        n = s->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors)
+            n = nb_sectors;
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
+                                            index_in_cluster,
+                                            index_in_cluster + n);
         if (!cluster_offset)
             return -1;
         if (s->crypt_method) {
@@ -1165,10 +789,8 @@
         } else {
             ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
         }
-        if (ret != n * 512 || alloc_cluster_link_l2(bs, cluster_offset, &l2meta) < 0) {
-            free_any_clusters(bs, cluster_offset, l2meta.nb_clusters);
+        if (ret != n * 512)
             return -1;
-        }
         nb_sectors -= n;
         sector_num += n;
         buf += n * 512;
@@ -1186,33 +808,8 @@
     uint64_t cluster_offset;
     uint8_t *cluster_data;
     BlockDriverAIOCB *hd_aiocb;
-    QEMUBH *bh;
-    QCowL2Meta l2meta;
 } QCowAIOCB;
 
-static void qcow_aio_read_cb(void *opaque, int ret);
-static void qcow_aio_read_bh(void *opaque)
-{
-    QCowAIOCB *acb = opaque;
-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
-    qcow_aio_read_cb(opaque, 0);
-}
-
-static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
-{
-    if (acb->bh)
-        return -EIO;
-
-    acb->bh = qemu_bh_new(cb, acb);
-    if (!acb->bh)
-        return -EIO;
-
-    qemu_bh_schedule(acb->bh);
-
-    return 0;
-}
-
 static void qcow_aio_read_cb(void *opaque, int ret)
 {
     QCowAIOCB *acb = opaque;
@@ -1222,12 +819,13 @@
 
     acb->hd_aiocb = NULL;
     if (ret < 0) {
-fail:
+    fail:
         acb->common.cb(acb->common.opaque, ret);
         qemu_aio_release(acb);
         return;
     }
 
+ redo:
     /* post process the read buffer */
     if (!acb->cluster_offset) {
         /* nothing to do */
@@ -1253,9 +851,12 @@
     }
 
     /* prepare next AIO request */
-    acb->n = acb->nb_sectors;
-    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n);
+    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9,
+                                             0, 0, 0, 0);
     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
+    acb->n = s->cluster_sectors - index_in_cluster;
+    if (acb->n > acb->nb_sectors)
+        acb->n = acb->nb_sectors;
 
     if (!acb->cluster_offset) {
         if (bs->backing_hd) {
@@ -1268,16 +869,12 @@
                 if (acb->hd_aiocb == NULL)
                     goto fail;
             } else {
-                ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
-                if (ret < 0)
-                    goto fail;
+                goto redo;
             }
         } else {
             /* Note: in this case, no need to wait */
             memset(acb->buf, 0, 512 * acb->n);
-            ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
-            if (ret < 0)
-                goto fail;
+            goto redo;
         }
     } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
         /* add AIO support for compressed blocks ? */
@@ -1285,9 +882,7 @@
             goto fail;
         memcpy(acb->buf, s->cluster_cache + index_in_cluster * 512,
                512 * acb->n);
-        ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
-        if (ret < 0)
-            goto fail;
+        goto redo;
     } else {
         if ((acb->cluster_offset & 511) != 0) {
             ret = -EIO;
@@ -1316,7 +911,6 @@
     acb->nb_sectors = nb_sectors;
     acb->n = 0;
     acb->cluster_offset = 0;
-    acb->l2meta.nb_clusters = 0;
 
     return acb;
 }
@@ -1340,8 +934,8 @@
     BlockDriverState *bs = acb->common.bs;
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster;
+    uint64_t cluster_offset;
     const uint8_t *src_buf;
-    int n_end;
 
     acb->hd_aiocb = NULL;
 
@@ -1352,11 +946,6 @@
         return;
     }
 
-    if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
-        free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
-        goto fail;
-    }
-
     acb->nb_sectors -= acb->n;
     acb->sector_num += acb->n;
     acb->buf += acb->n * 512;
@@ -1369,22 +958,19 @@
     }
 
     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
-    n_end = index_in_cluster + acb->nb_sectors;
-    if (s->crypt_method &&
-        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
-        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
-
-    acb->cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
-                                               index_in_cluster,
-                                               n_end, &acb->n, &acb->l2meta);
-    if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
+    acb->n = s->cluster_sectors - index_in_cluster;
+    if (acb->n > acb->nb_sectors)
+        acb->n = acb->nb_sectors;
+    cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, 1, 0,
+                                        index_in_cluster,
+                                        index_in_cluster + acb->n);
+    if (!cluster_offset || (cluster_offset & 511) != 0) {
         ret = -EIO;
         goto fail;
     }
     if (s->crypt_method) {
         if (!acb->cluster_data) {
-            acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
-                                             s->cluster_size);
+            acb->cluster_data = qemu_mallocz(s->cluster_size);
             if (!acb->cluster_data) {
                 ret = -ENOMEM;
                 goto fail;
@@ -1397,7 +983,7 @@
         src_buf = acb->buf;
     }
     acb->hd_aiocb = bdrv_aio_write(s->hd,
-                                   (acb->cluster_offset >> 9) + index_in_cluster,
+                                   (cluster_offset >> 9) + index_in_cluster,
                                    src_buf, acb->n,
                                    qcow_aio_write_cb, acb);
     if (acb->hd_aiocb == NULL)
@@ -1571,7 +1157,7 @@
 
     memset(s->l1_table, 0, l1_length);
     if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
-	return -1;
+        return -1;
     ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
     if (ret < 0)
         return ret;
@@ -1637,10 +1223,8 @@
         /* could not compress: write normal cluster */
         qcow_write(bs, sector_num, buf, s->cluster_sectors);
     } else {
-        cluster_offset = alloc_compressed_cluster_offset(bs, sector_num << 9,
-                                                         out_len);
-        if (!cluster_offset)
-            return -1;
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
+                                            out_len, 0, 0);
         cluster_offset &= s->cluster_offset_mask;
         if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
             qemu_free(out_buf);
@@ -2225,19 +1809,26 @@
     BDRVQcowState *s = bs->opaque;
     int i, nb_clusters;
 
-    nb_clusters = size_to_clusters(s, size);
-retry:
-    for(i = 0; i < nb_clusters; i++) {
-        int64_t i = s->free_cluster_index++;
-        if (get_refcount(bs, i) != 0)
-            goto retry;
-    }
+    nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
+    for(;;) {
+        if (get_refcount(bs, s->free_cluster_index) == 0) {
+            s->free_cluster_index++;
+            for(i = 1; i < nb_clusters; i++) {
+                if (get_refcount(bs, s->free_cluster_index) != 0)
+                    goto not_found;
+                s->free_cluster_index++;
+            }
 #ifdef DEBUG_ALLOC2
-    printf("alloc_clusters: size=%lld -> %lld\n",
-            size,
-            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
+            printf("alloc_clusters: size=%lld -> %lld\n",
+                   size,
+                   (s->free_cluster_index - nb_clusters) << s->cluster_bits);
 #endif
-    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
+            return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
+        } else {
+        not_found:
+            s->free_cluster_index++;
+        }
+    }
 }
 
 static int64_t alloc_clusters(BlockDriverState *bs, int64_t size)
@@ -2301,7 +1892,8 @@
     int new_table_size, new_table_size2, refcount_table_clusters, i, ret;
     uint64_t *new_table;
     int64_t table_offset;
-    uint8_t data[12];
+    uint64_t data64;
+    uint32_t data32;
     int old_table_size;
     int64_t old_table_offset;
 
@@ -2340,10 +1932,13 @@
     for(i = 0; i < s->refcount_table_size; i++)
         be64_to_cpus(&new_table[i]);
 
-    cpu_to_be64w((uint64_t*)data, table_offset);
-    cpu_to_be32w((uint32_t*)(data + 8), refcount_table_clusters);
+    data64 = cpu_to_be64(table_offset);
     if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
-                    data, sizeof(data)) != sizeof(data))
+                    &data64, sizeof(data64)) != sizeof(data64))
+        goto fail;
+    data32 = cpu_to_be32(refcount_table_clusters);
+    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_clusters),
+                    &data32, sizeof(data32)) != sizeof(data32))
         goto fail;
     qemu_free(s->refcount_table);
     old_table_offset = s->refcount_table_offset;
@@ -2572,7 +2167,7 @@
     uint16_t *refcount_table;
 
     size = bdrv_getlength(s->hd);
-    nb_clusters = size_to_clusters(s, size);
+    nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
     refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
 
     /* header */
@@ -2624,7 +2219,7 @@
     int refcount;
 
     size = bdrv_getlength(s->hd);
-    nb_clusters = size_to_clusters(s, size);
+    nb_clusters = (size + s->cluster_size - 1) >> s->cluster_bits;
     for(k = 0; k < nb_clusters;) {
         k1 = k;
         refcount = get_refcount(bs, k);

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html