--- /dev/null	2008-03-30 12:15:48.586669308 +0200
+++ linux-2.6.24logfs/fs/logfs/gc.c	2008-04-01 19:48:36.066470483 +0200
@@ -0,0 +1,729 @@
+/*
+ * fs/logfs/gc.c	- garbage collection code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@xxxxxxxxx>
+ *
+ * GC design as it should be (and isn't, as of 15.3.07):
+ * 1. Pick a good candidate for GC, constrained by the number of currently
+ *    free segments.
+ * 2. Move all valid blocks in this segment until
+ *    a) they are all gone, or
+ *    b) the number of currently free segments drops too low.
+ * 3. Mark the segment as GC-pending or so, because not all indirect blocks
+ *    have been written yet.
+ * 4. Either
+ *    a) goto 1. or
+ *    b) write dirty indirect blocks directly
+ *
+ * Sooner or later 4b) should be taken, causing a number of segments to be
+ * freed.  2b) will consume free segments until this point is reached.  Overall
+ * progress will be made, even though less than a full segment may be gained.
+ *
+ * The crucial question is when to choose 4b) over 4a).
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+
+#define SCAN_RATIO	16	/* number of scanned segments per gc'd segment */
+#define LIST_SIZE	16	/* base size of candidate lists */
+#define SCAN_ROUNDS	128	/* maximum number of complete medium scans */
+#define SCAN_ROUNDS_HIGH 4	/* maximum number of higher-level scans */
+
+static void __logfs_gc_pass(struct super_block *sb, int target);
+
+/* journal has distance -1, top-most ifile layer distance 0 */
+static u8 root_distance(struct super_block *sb, u8 level)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	switch (level) {
+	case 0: /* fall through */
+	case 1: /* fall through */
+	case 2: /* fall through */
+	case 3:
+		/* file data or indirect blocks */
+		return super->s_ifile_levels + super->s_iblock_levels - level;
+	case 6: /* fall through */
+	case 7: /* fall through */
+	case 8: /* fall through */
+	case 9:
+		/* inode file data or indirect blocks */
+		return super->s_ifile_levels - (level-6);
+	default:
+		printk(KERN_ERR "LOGFS: segment of unknown level %x found\n",
+				level);
+		WARN_ON(1);
+		return super->s_ifile_levels + super->s_iblock_levels;
+	}
+}
+
+int logfs_safe_to_write_block(struct super_block *sb, u8 level)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	return root_distance(sb, level) <= super->s_free_list.count;
+}
+
+static int segment_is_reserved(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area;
+	struct gc_candidate *cand;
+	void *reserved;
+	int i;
+
+	/* Some segments are reserved.  Just pretend they were all valid */
+	reserved = btree_lookup(&super->s_reserved_segments, segno);
+	if (reserved)
+		return 1;
+
+	/* Currently open segments */
+	for_each_area(i) {
+		area = super->s_area[i];
+		if (area->a_is_open && area->a_segno == segno)
+			return 1;
+	}
+
+	/* On the free list */
+	list_for_each_entry(cand, &super->s_free_list.list, list)
+		if (cand->segno == segno)
+			return 1;
+
+	return 0;
+}
+
+static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
+{
+	BUG();
+}
+
+/*
+ * Count the bytes consumed by valid objects in this segment.  Object headers
+ * are counted, the segment header is not.
+ */
+static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
+		u8 *level, u64 *segment_gec)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_segment_header sh;
+	struct logfs_object_header oh;
+	u64 ofs, ino, bix;
+	u32 seg_ofs, valid, size;
+	int err;
+
+	err = device_read(sb, segno, 0, sizeof(sh), &sh);
+	BUG_ON(err);
+	if (!memchr_inv(&sh, 0xff, sizeof(sh)))
+		return 0;
+
+	*level = sh.level;
+	*ec = be32_to_cpu(sh.ec);
+	*segment_gec = be64_to_cpu(sh.gec);
+	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
+		logfs_mark_segment_bad(sb, segno);
+		return super->s_segsize - 1;
+	}
+
+	valid = 0;
+	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
+			seg_ofs + sizeof(oh) < super->s_segsize;) {
+		err = device_read(sb, segno, seg_ofs, sizeof(oh), &oh);
+		BUG_ON(err);
+		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
+			break;
+
+		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
+			logfs_mark_segment_bad(sb, segno);
+			return super->s_segsize - 1;
+		}
+
+		ofs = dev_ofs(sb, segno, seg_ofs);
+		ino = be64_to_cpu(oh.ino);
+		bix = be64_to_cpu(oh.bix);
+		size = (u32)be16_to_cpu(oh.len) + sizeof(oh);
+
+		if (logfs_is_valid_block(sb, ofs, ino, bix, *level))
+			valid += size;
+		seg_ofs += size;
+	}
+	pr_debug("LOGFS valid(%x) = %x\n", segno, valid);
+	return valid;
+}
+
+/* FIXME: check for off-by-one on max_dist */
+/*
+ * GC distance N:
+ *	while (valid blocks in segment) {
+ *		rewrite one block
+ *		if (fewer than N free segments)
+ *			GC distance N-1
+ *	}
+ *	while (N > 1) {
+ *		N--;
+ *		flush dirty list N
+ *	}
+ *	Exit criterion: free segments >= N
+ *
+ * flush dirty list N:
+ *	while (dirty blocks on level) {
+ *		write block
+ *		if (fewer than N free segments)
+ *			GC distance N-1
+ *	}
+ */
+
+/*
+ * Move all blocks from the gc_dirty lists to a private one, then recursively
+ * call into GC again to free some segments on higher layers.
+ */
+static void recursive_backoff(struct super_block *sb, int max_dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block, *tmp;
+	int i;
+	LIST_HEAD(backoff_list);
+
+	for_each_area(i) {
+		if (i > max_dist)
+			break;
+		list_for_each_entry_safe(block, tmp, &super->s_gc_dirty_list[i], dirty_list) {
+			list_move_tail(&block->dirty_list, &backoff_list);
+		}
+	}
+	__logfs_gc_pass(sb, max_dist);
+	list_for_each_entry_safe(block, tmp, &backoff_list, dirty_list)
+		logfs_dirty_for_gc(sb, block);
+}
+
+/* Write back any indirect/inode blocks dirtied during the GC run. */
+static void flush_gc_dirty_list(struct super_block *sb, int dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+	struct page *page;
+	struct inode *inode;
+	long flags;
+	int err;
+
+	for (;;) {
+		if (dist < 0)
+			break;
+		if (list_empty(&super->s_gc_dirty_list[dist])) {
+			dist--;
+			continue;
+		}
+
+		block = list_entry(super->s_gc_dirty_list[dist].next,
+				struct logfs_block, dirty_list);
+		page = block->page;
+		inode = page->mapping->host;
+		if (super->s_free_list.count < dist) {
+			/* We ran out of room on this level.  Interestingly
+			 * enough, this case is possible in theory, but never
+			 * triggered in practice.
+			 */
+			recursive_backoff(sb, dist);
+			continue;
+		}
+
+		flags = WF_GC;
+		if (super->s_free_bytes < dist * LOGFS_MAX_OBJECTSIZE
+				+ super->s_gc_reserve
+				+ super->s_dirty_free_bytes)
+			flags |= WF_SYNC;
+		err = logfs_write_buf(inode, page, NULL, flags);
+		BUG_ON(err);
+	}
+}
+
+void logfs_dirty_for_gc(struct super_block *sb, struct logfs_block *block)
+{
+	struct page *page;
+	struct inode *inode;
+	u64 bix;
+	u8 level, dist;
+
+	page = block->page;
+	inode = page->mapping->host;
+	logfs_unpack_index(block->page->index, &bix, &level);
+	if (inode->i_ino == LOGFS_INO_MASTER)
+		level += LOGFS_MAX_LEVELS;
+
+	dist = root_distance(sb, level);
+	list_move_tail(&block->dirty_list, &logfs_super(sb)->s_gc_dirty_list[dist]);
+}
+
+static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
+		u64 bix, int level, long flags)
+{
+	struct inode *inode;
+	int err, cookie;
+
+	inode = logfs_iget(sb, ino, &cookie);
+	BUG_ON(!inode);
+	err = logfs_rewrite_block(inode, bix, ofs, level, flags);
+	BUG_ON(err);
+	logfs_iput(inode, cookie);
+}
+
+static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_segment_header sh;
+	struct logfs_object_header oh;
+	u64 ofs, ino, bix;
+	u32 seg_ofs, cleaned = 0;
+	int level, err, len, valid;
+	long flags;
+
+	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
+
+	btree_insert(&super->s_reserved_segments, segno, (void *)1);
+	err = device_read(sb, segno, 0, sizeof(sh), &sh);
+	BUG_ON(err);
+	level = sh.level;
+	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
+		logfs_mark_segment_bad(sb, segno);
+		cleaned = -1;
+		goto out;
+	}
+
+	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
+			seg_ofs + sizeof(oh) < super->s_segsize; ) {
+		ofs = dev_ofs(sb, segno, seg_ofs);
+		err = device_read(sb, segno, seg_ofs, sizeof(oh), &oh);
+		BUG_ON(err);
+
+		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
+			break;
+
+		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
+			logfs_mark_segment_bad(sb, segno);
+			cleaned = super->s_segsize - 1;
+			goto out;
+		}
+
+		ino = be64_to_cpu(oh.ino);
+		bix = be64_to_cpu(oh.bix);
+		len = sizeof(oh) + be16_to_cpu(oh.len);
+		valid = logfs_is_valid_block(sb, ofs, ino, bix, level);
+		if (valid) {
+			/* Garbage collection may consume further free segments.
+			 * If space gets too tight, abort and continue with a
+			 * higher level segment for the moment.
+			 */
+			if (super->s_free_list.count < dist)
+				recursive_backoff(sb, dist);
+
+			flags = WF_GC;
+			if (super->s_free_bytes < dist * LOGFS_MAX_OBJECTSIZE
+					+ super->s_gc_reserve
+					+ super->s_dirty_free_bytes)
+				flags |= WF_SYNC;
+			logfs_cleanse_block(sb, ofs, ino, bix, level, flags);
+			cleaned += len;
+		}
+		seg_ofs += len;
+	}
+
+	flush_gc_dirty_list(sb, dist - 1);
+out:
+	btree_remove(&super->s_reserved_segments, segno);
+	return cleaned;
+}
+
+static struct gc_candidate *add_list(struct gc_candidate *cand,
+		struct candidate_list *list)
+{
+	struct gc_candidate *cur, *removed = NULL;
+	int cont;
+
+	/* insert sorted */
+	list_for_each_entry(cur, &list->list, list) {
+		if (list->sort_by_ec)
+			cont = cur->erase_count < cand->erase_count;
+		else
+			cont = cur->valid < cand->valid;
+		if (cont)
+			continue;
+		list_add_tail(&cand->list, &cur->list);
+		cand = NULL;
+		break;
+	}
+	/* if list is empty or candidate is worse than entire list */
+	if (cand)
+		list_add_tail(&cand->list, &list->list);
+
+	/* remove worst entry if list is full */
+	if (list->count >= list->maxcount) {
+		removed = list_entry(list->list.prev,
+				struct gc_candidate, list);
+		list_del(&removed->list);
+	} else
+		list->count++;
+
+	return removed;
+}
+
+struct gc_candidate *get_best_cand(struct candidate_list *list)
+{
+	struct gc_candidate *cand;
+
+	if (list->count == 0)
+		return NULL;
+
+	cand = list_entry(list->list.next, struct gc_candidate, list);
+	list_del(&cand->list);
+	list->count--;
+	return cand;
+}
+
+/*
+ * We try to stuff the candidate in several lists in order.  Any list may
+ * either reject it or evict another candidate when full.  Anything left after
+ * trying the last list gets freed.
+ */
+static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
+
+	/* 100% free segments */
+	if (cand->valid == 0)
+		cand = add_list(cand, &super->s_free_list);
+	/* good candidates for garbage collection */
+	if (cand && cand->valid < full)
+		cand = add_list(cand, &super->s_low_list[cand->dist]);
+	/* good candidates for wear leveling;
+	 * segments that were written recently are ignored */
+	if (cand && cand->gec < super->s_gec + 2*super->s_no_segs)
+		cand = add_list(cand, &super->s_ec_list);
+
+	kfree(cand);
+}
+
+static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
+		u8 dist, u64 segment_gec)
+{
+	struct gc_candidate *cand;
+
+	cand = kmalloc(sizeof(*cand), GFP_KERNEL);
+	if (!cand)
+		return -ENOMEM;
+
+	cand->segno = segno;
+	cand->valid = valid;
+	cand->erase_count = ec;
+	cand->dist = dist;
+	cand->gec = segment_gec;
+
+	/* FIXME: add to btree */
+	__add_candidate(sb, cand);
+	return 0;
+}
+
+int add_free_segments_from_journal(struct super_block *sb,
+		struct logfs_je_free_segments *segs, int count)
+{
+	int i, err;
+
+	for (i = 0; i < count; i++) {
+		u32 segno = be32_to_cpu(segs[i].segno);
+		u32 ec = be32_to_cpu(segs[i].ec);
+		err = add_candidate(sb, segno, 0, ec, 0, 0);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void __del_segment(struct candidate_list *list, u32 segno)
+{
+	struct gc_candidate *cand;
+
+	list_for_each_entry(cand, &list->list, list)
+		if (cand->segno == segno) {
+			list_del(&cand->list);
+			list->count -= 1;
+			kfree(cand);
+			return;
+		}
+}
+
+static void del_segment(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	__del_segment(&super->s_free_list, segno);
+	for_each_area(i)
+		__del_segment(&super->s_low_list[i], segno);
+	__del_segment(&super->s_ec_list, segno);
+}
+
+static void scan_segment(struct super_block *sb, u32 segno)
+{
+	u64 segment_gec = 0;
+	u32 valid, ec = 0;
+	u8 dist, level = 0;
+
+	if (segment_is_reserved(sb, segno))
+		return;
+
+	del_segment(sb, segno);
+	valid = logfs_valid_bytes(sb, segno, &ec, &level, &segment_gec);
+	dist = root_distance(sb, level);
+	add_candidate(sb, segno, valid, ec, dist, segment_gec);
+}
+
+static struct gc_candidate *first_in_list(struct candidate_list *list)
+{
+	if (list->count == 0)
+		return NULL;
+	return list_entry(list->list.next, struct gc_candidate, list);
+}
+
+static struct gc_candidate *get_wl_candidate(struct super_block *sb,
+		int max_dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *cand;
+	u64 cand_gec;
+
+	if (super->s_gec & 0xf)
+		return NULL;
+
+	cand = first_in_list(&super->s_ec_list);
+	if (!cand)
+		return NULL;
+	if (cand->dist > max_dist)
+		return NULL;
+
+	/* Instead of comparing the candidate's erase count to the average
+	 * erase count, which would involve a 64bit division, we multiply
+	 * the candidate's erase count by the number of segments.  In effect,
+	 * the comparison is:
+	 * if (cand->erase_count + 20 > average_erasecount)
+	 */
+	cand_gec = cand->erase_count;
+	cand_gec += 20; /* FIXME: should be a superblock variable */
+	cand_gec *= super->s_no_segs;
+	if (cand_gec > super->s_gec)
+		return NULL;
+
+	list_del(&cand->list);
+	super->s_ec_list.count--;
+	return cand;
+}
+
+static struct gc_candidate *get_candidate(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i, max_dist;
+	struct gc_candidate *cand = NULL, *this;
+
+	max_dist = min(super->s_free_list.count, LOGFS_NO_AREAS);
+
+	cand = get_wl_candidate(sb, max_dist);
+	if (cand)
+		return cand;
+
+	for (i = 0; i <= max_dist; i++) {
+		this = first_in_list(&super->s_low_list[i]);
+		if (!this)
+			continue;
+		if (!cand)
+			cand = this;
+		if (this->valid < cand->valid)
+			cand = this;
+	}
+	if (cand) {
+		list_del(&cand->list);
+		super->s_low_list[cand->dist].count--;
+	}
+	return cand;
+}
+
+static int logfs_gc_once(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *cand;
+	u64 segment_gec;
+	u32 cleaned, valid, segno, ec;
+	u8 dist;
+
+	cand = get_candidate(sb);
+	if (!cand)
+		return 0;
+
+	valid = cand->valid;
+	segno = cand->segno;
+	dist = cand->dist;
+	ec = cand->erase_count;
+	segment_gec = cand->gec;
+	kfree(cand);
+	pr_debug("GC segment #%02x at %x, %x required, %x free, %x valid, %llx free, %llx reserve\n",
+			segno, segno << super->s_segshift,
+			dist, super->s_free_list.count, valid,
+			super->s_free_bytes, super->s_gc_reserve);
+	cleaned = logfs_gc_segment(sb, segno, dist);
+	pr_debug("GC segment #%02x complete\n", segno);
+	add_candidate(sb, segno, valid - cleaned, ec, dist, segment_gec);
+	return 1;
+}
+
+/* returns 1 if a wrap occurs, 0 otherwise */
+static int logfs_scan_some(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 segno;
+	int i, ret = 0;
+
+	segno = super->s_sweeper;
+	for (i = SCAN_RATIO; i > 0; i--) {
+		segno++;
+		if (segno >= super->s_no_segs) {
+			segno = 0;
+			ret = 1;
+		}
+
+		scan_segment(sb, segno);
+	}
+	super->s_sweeper = segno;
+	return ret;
+}
+
+/*
+ * In principle, this function should loop forever, looking for GC candidates
+ * and moving data.  LogFS is designed in such a way that this loop is
+ * guaranteed to terminate.
+ *
+ * Limiting the loop to some iterations serves purely to catch cases when
+ * these guarantees have failed.  An actual endless loop is an obvious bug
+ * and should be reported as such.
+ */
+static void __logfs_gc_pass(struct super_block *sb, int target)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int round, progress, last_progress = 0;
+
+	pr_debug("__logfs_gc_pass(%x)\n", target);
+	for (round = 0; round < SCAN_ROUNDS; ) {
+		if (super->s_free_list.count >= target)
+			return;
+		round += logfs_scan_some(sb);
+		if (super->s_free_list.count >= target)
+			return;
+		progress = logfs_gc_once(sb);
+		if (progress)
+			last_progress = round;
+		else if (round - last_progress > 2)
+			break;
+	}
+	LOGFS_BUG(sb);
+}
+
+void logfs_gc_pass(struct super_block *sb)
+{
+	__logfs_gc_pass(sb, logfs_super(sb)->s_total_levels);
+}
+
+static int check_area(struct super_block *sb, int i)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_area[i];
+	struct logfs_object_header h;
+	u32 segno = area->a_segno;
+	u32 ofs = area->a_used_bytes;
+	__be32 crc;
+	int err;
+
+	if (!area->a_is_open)
+		return 0;
+
+	for (ofs = area->a_used_bytes;
+			ofs <= super->s_segsize - sizeof(h);
+			ofs += (u32)be16_to_cpu(h.len) + sizeof(h)) {
+		err = device_read(sb, segno, ofs, sizeof(h), &h);
+		if (err)
+			return err;
+
+		if (!memchr_inv(&h, 0xff, sizeof(h)))
+			break;
+
+		crc = logfs_crc32(&h, sizeof(h) - 4, 4);
+		if (crc != h.crc) {
+			printk(KERN_INFO "interrupted header at %llx\n",
+					dev_ofs(sb, segno, ofs));
+			return 0;
+		}
+	}
+	if (ofs > super->s_segsize - LOGFS_MAX_OBJECTSIZE) {
+		printk(KERN_INFO "%x bytes unaccounted data found at %llx - closing it\n",
+				ofs - area->a_used_bytes,
+				dev_ofs(sb, segno, ofs));
+		area->a_segno = 0;
+		area->a_is_open = 0;
+	} else if (ofs != area->a_used_bytes) {
+		printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
+				ofs - area->a_used_bytes,
+				dev_ofs(sb, segno, ofs));
+		area->a_used_bytes = ofs;
+	}
+	return 0;
+}
+
+int logfs_check_areas(struct super_block *sb)
+{
+	int i, err;
+
+	for_each_area(i) {
+		err = check_area(sb, i);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void logfs_init_candlist(struct candidate_list *list, int maxcount,
+		int sort_by_ec)
+{
+	list->count = 0;
+	list->maxcount = maxcount;
+	list->sort_by_ec = sort_by_ec;
+	INIT_LIST_HEAD(&list->list);
+}
+
+int logfs_init_gc(struct logfs_super *super)
+{
+	int i;
+
+	logfs_init_candlist(&super->s_free_list, 4*LIST_SIZE, 1);
+	for_each_area(i)
+		logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
+	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
+	return 0;
+}
+
+static void logfs_cleanup_list(struct candidate_list *list)
+{
+	struct gc_candidate *cand, *next;
+
+	list_for_each_entry_safe(cand, next, &list->list, list) {
+		list_del(&cand->list);
+		kfree(cand);
+	}
+}
+
+void logfs_cleanup_gc(struct logfs_super *super)
+{
+	int i;
+
+	if (!super->s_free_list.list.next)
+		return;
+
+	logfs_cleanup_list(&super->s_free_list);
+	for_each_area(i)
+		logfs_cleanup_list(&super->s_low_list[i]);
+	logfs_cleanup_list(&super->s_ec_list);
+}
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html