--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ linux-2.6.24logfs/fs/logfs/readwrite.c	2008-04-02 00:01:54.545819992 +0200
@@ -0,0 +1,1618 @@
+/*
+ * fs/logfs/readwrite.c
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2007 Joern Engel <joern@xxxxxxxxx>
+ *
+ *
+ * Actually contains eight sets of very similar functions:
+ *	read		read blocks from a file
+ *	seek_hole	find next hole
+ *	seek_data	find next data block
+ *	valid		check whether a block still belongs to a file
+ *	write		write blocks to a file
+ *	delete		delete a block (for directories and ifile)
+ *	rewrite		move existing blocks of a file to a new location (gc helper)
+ *	truncate	truncate a file
+ */
+#include "logfs.h"
+
+static int adjust_level(int level)
+{
+	if (level >= LOGFS_MAX_LEVELS)
+		level -= LOGFS_MAX_LEVELS;
+	WARN_ON(level >= LOGFS_MAX_LEVELS);
+	return level;
+}
+
+static u64 adjust_bix(u64 bix, u8 level)
+{
+	switch (adjust_level(level)) {
+	case 0:
+		return bix;
+	case 1:
+		return max_t(u64, bix, I0_BLOCKS);
+	case 2:
+		return max_t(u64, bix, I1_BLOCKS);
+	case 3:
+		return max_t(u64, bix, I2_BLOCKS);
+	default:
+		WARN_ON(1);
+		return bix;
+	}
+}
+
+/**
+ * The inode address space is cut in two halves. Lower half belongs to data
+ * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is
+ * set, the actual block index (bix) and level can be derived from the page
+ * index.
+ *
+ * The lowest three bits of the block index are set to 0 after packing and
+ * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored
+ * anyway this is harmless.
+ */
+#define ARCH_SHIFT	(BITS_PER_LONG - 32)
+#define INDIRECT_BIT	(0x80000000UL << ARCH_SHIFT)
+#define LEVEL_SHIFT	(28 + ARCH_SHIFT)
+static pgoff_t logfs_pack_index(u64 bix, u8 level)
+{
+	pgoff_t index;
+
+	BUG_ON(bix >= INDIRECT_BIT);
+	BUG_ON(level > 7);
+	if (level == 0)
+		return bix;
+
+	index = INDIRECT_BIT;
+	index |= (long)level << LEVEL_SHIFT;
+	index |= bix >> (level*LOGFS_BLOCK_BITS);
+	return index;
+}
+
+void logfs_unpack_index(pgoff_t index, u64 *bix, u8 *level)
+{
+	if (!(index & INDIRECT_BIT)) {
+		*bix = index;
+		*level = 0;
+		return;
+	}
+
+	*level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
+	*bix = (index << (*level*LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
+	*bix = adjust_bix(*bix, *level);
+	return;
+}
+#undef ARCH_SHIFT
+#undef INDIRECT_BIT
+#undef LEVEL_SHIFT
+
+/**
+ * logfs_flush_dirty - flush dirty blocks
+ * @sb: filesystem superblock
+ * @sync: if 0, only flush enough to continue writing,
+ *	  if 1, completely flush list
+ */
+void logfs_flush_dirty(struct super_block *sb, int sync)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 bytes = LOGFS_MAX_LEVELS * LOGFS_MAX_OBJECTSIZE;
+	struct logfs_block *block;
+	struct page *page;
+	struct inode *inode;
+	int ret;
+
+	while (super->s_dirty_free_bytes || super->s_dirty_used_bytes) {
+		if (!sync && (super->s_free_bytes >= bytes + super->s_gc_reserve +
+					super->s_dirty_free_bytes))
+			break;
+
+		BUG_ON(list_empty(&super->s_dirty_list));
+		block = list_entry(super->s_dirty_list.next, struct logfs_block,
+				dirty_list);
+		page = block->page;
+		inode = page->mapping->host;
+		ret = logfs_write_buf(inode, page, NULL, 0);
+		BUG_ON(ret);
+		/* We may need to GC some more after writing a page */
+		logfs_gc_pass(sb);
+	}
+}
+
+/*
+ * Logfs is prone to an AB-BA deadlock where one task tries to acquire
+ * s_w_mutex with a locked page and GC tries to get that page while holding
+ * s_w_mutex.
+ * To solve this issue logfs will ignore the page lock iff the page in question + * is waiting for s_w_mutex. We annotate this fact by setting PG_pre_locked + * in addition to PG_locked. + * + * FIXME: Logfs already uses PG_owner_priv_1 for other purposes and there is + * no PG_owner_priv_2. Currently we abuse the (hopefully) free flag 18. But + * that flag may get reused any minute. In fact, Christoph Lameter recently + * sent a patchset to reshuffle page flags. Highly dangerous. + */ +#define PG_pre_locked 18 +#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags) +#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags) +#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags) +static void logfs_get_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + if (lock) { + struct logfs_super *super = logfs_super(sb); + + if (page) + SetPagePreLocked(page); + mutex_lock(&super->s_w_mutex); + super->s_write_page = page; + logfs_gc_pass(sb); + /* FIXME: We also have to check for shadowed space + * and mempool fill grade */ + logfs_flush_dirty(sb, 0); + } +} + +static void logfs_put_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + if (lock) { + logfs_super(sb)->s_write_page = NULL; + /* Order matters - we must clear PG_pre_locked before releasing + * s_w_mutex or we could race against another task. */ + if (page) + ClearPagePreLocked(page); + mutex_unlock(&logfs_super(sb)->s_w_mutex); + } +} + +static struct page *logfs_get_read_page(struct inode *inode, u64 bix, u8 level) +{ + return find_or_create_page(inode->i_mapping, + logfs_pack_index(bix, level), GFP_NOFS); +} + +static void logfs_put_read_page(struct page *page) +{ + unlock_page(page); + page_cache_release(page); +} + +static struct page *logfs_get_page(struct inode *inode, u64 bix, u8 level) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index = logfs_pack_index(bix, level); + struct page *page; + int err; + int loop = 0; + +repeat: + page = find_get_page(mapping, index); + if (!page) { + page = __page_cache_alloc(GFP_NOFS); + if (!page) + return NULL; + err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; + return NULL; + } + } else while (unlikely(TestSetPageLocked(page))) { + if (PagePreLocked(page)) { + /* Holder of page lock is waiting for us, it + * is safe to use this page. */ + return page; + } + if (loop++ > 0x1000) { + /* Has been observed once so far... */ + printk(KERN_ERR "stack at %p\n", &loop); + BUG(); + } + /* Some other process has this page locked and has + * nothing to do with us. Wait for it to finish. 
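+		 * (We poll with schedule() rather than sleeping in lock_page()
+		 * so the PagePreLocked() check above gets re-evaluated on each
+		 * iteration, in case the lock holder only starts waiting for
+		 * s_w_mutex after we entered this loop.)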
+ */ + schedule(); + } + return page; +} + +static void logfs_put_page(struct inode *inode, struct page *page) +{ + if (likely(!PagePreLocked(page))) + unlock_page(page); + page_cache_release(page); +} + +static struct page *logfs_get_write_page(struct inode *inode, u64 bix, u8 level) +{ + struct page *write_page = logfs_super(inode->i_sb)->s_write_page; + pgoff_t index = logfs_pack_index(bix, level); + + if (write_page && (inode->i_mapping == write_page->mapping) + && (index == write_page->index)) + return write_page; + else + return logfs_get_page(inode, bix, level); +} + +static void logfs_put_write_page(struct inode *inode, struct page *page) +{ + struct page *write_page = logfs_super(inode->i_sb)->s_write_page; + + if (page != write_page) + logfs_put_page(inode, page); +} + +static unsigned long __get_bits(u64 val, int skip, int no) +{ + u64 ret = val; + + ret >>= skip * no; + ret <<= 64 - no; + ret >>= 64 - no; + return ret; +} + +static unsigned long get_bits(u64 val, int skip) +{ + return __get_bits(val, skip, LOGFS_BLOCK_BITS); +} + +/* + * Returns: + * 0 if all pointers are NUL + * -1 if all pointers have LOGFS_FULLY_POPULATED set + * 1 if at least one pointer is non-NUL and + * at least one has LOGFS_FULLY_POPULATED cleared + */ +enum blockstate { + bs_fully_populated = -1, + bs_all_zero = 0, + bs_mixed = 1, +}; + +static int block_state(__be64 *block) +{ + int i; + + if (block[0]) { + for (i = 1; i < LOGFS_BLOCK_FACTOR; i++) + if (!(block[i] & cpu_to_be64(LOGFS_FULLY_POPULATED))) + return bs_mixed; + return bs_fully_populated; + } else { + for (i = 1; i < LOGFS_BLOCK_FACTOR; i++) + if (block[i]) + return bs_mixed; + return bs_all_zero; + } +} + +static int page_state(struct page *page) +{ + __be64 *block; + int ret; + + block = kmap_atomic(page, KM_USER0); + ret = block_state(block); + kunmap_atomic(block, KM_USER0); + return ret; +} + +static void alloc_block(struct page *page) +{ + struct logfs_super *super = logfs_super(page->mapping->host->i_sb); + struct logfs_block *block; + + if (PagePrivate(page)) + return; + + block = mempool_alloc(super->s_block_pool, GFP_KERNEL); + INIT_LIST_HEAD(&block->dirty_list); + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; +} + +static struct shadow_tree *logfs_page_to_tree(struct page *page) +{ + alloc_block(page); + return &logfs_block(page)->shadow_tree; +} + +static void block_set_pointer(struct page *page, int index, u64 ptr) +{ + __be64 *block; + + block = kmap_atomic(page, KM_USER0); + block[index] = cpu_to_be64(ptr); + flush_dcache_page(page); + kunmap_atomic(block, KM_USER0); + SetPageUptodate(page); +} + +static u64 block_get_pointer(struct page *page, int index) +{ + __be64 *block; + u64 ptr; + + block = kmap_atomic(page, KM_USER0); + ptr = be64_to_cpu(block[index]); + kunmap_atomic(block, KM_USER0); + return ptr; +} + +static int logfs_read_empty(struct page *page) +{ + zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); + SetPageZero(page); + return 0; +} + +static int logfs_read_embedded(struct page *page, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + void *buf; + + buf = kmap_atomic(page, KM_USER0); + memcpy(buf, li->li_data, LOGFS_EMBEDDED_SIZE); + memset(buf + LOGFS_EMBEDDED_SIZE, 0, + PAGE_CACHE_SIZE - LOGFS_EMBEDDED_SIZE); + flush_dcache_page(page); + kunmap_atomic(buf, KM_USER0); + return 0; +} + +static int logfs_read_direct(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + pgoff_t index = page->index; + u64 block; + 
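+	/* Blocks below I0_BLOCKS are referenced by direct pointers in the inode */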
+ block = li->li_data[index]; + if (!block) + return logfs_read_empty(page); + + return logfs_segment_read(inode, page, block, index, 0); +} + +static int logfs_read_loop(struct inode *inode, struct page *page, int count) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[I1_INDEX + count]; + pgoff_t bix = page->index; + int level, ret; + struct page *ipage; + + if (!bofs) + return logfs_read_empty(page); + + for (level = count + 1; level > 0; level--) { + ipage = logfs_get_read_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, level-1)); + logfs_put_read_page(ipage); + if (!bofs) + return logfs_read_empty(page); + } + + return logfs_segment_read(inode, page, bofs, bix, 0); +} + +static int logfs_read_block(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + pgoff_t index = page->index; + + if (li->li_flags & LOGFS_IF_EMBEDDED) { + if (index != 0) + return logfs_read_empty(page); + else + return logfs_read_embedded(page, inode); + } else if (index < I0_BLOCKS) + return logfs_read_direct(inode, page); + else if (index < I1_BLOCKS) + return logfs_read_loop(inode, page, 0); + else if (index < I2_BLOCKS) + return logfs_read_loop(inode, page, 1); + else if (index < I3_BLOCKS) + return logfs_read_loop(inode, page, 2); + + BUG(); + return -EIO; +} + +static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + + for (; bix < I0_BLOCKS; bix++) + if (data ^ (li->li_data[bix] == 0)) + return bix; + return I0_BLOCKS; +} + +static u64 seek_holedata_loop(struct inode *inode, u64 bix, int count, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + __be64 *rblock; + u64 bofs = li->li_data[I1_INDEX + count]; + int level, ret, slot; + struct page *page; + + BUG_ON(!bofs); + + for (level = count + 1; level > 0; level--) { + page = logfs_get_read_page(inode, bix, level); + if (!page) + return bix; + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_read_page(page); + return bix; + } + + slot = get_bits(bix, level-1); + rblock = kmap_atomic(page, KM_USER0); + while (slot < LOGFS_BLOCK_FACTOR) { + if (data && (rblock[slot] != 0)) + break; + if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED)) + break; + slot++; + bix += 1 << (LOGFS_BLOCK_BITS * (level-1)); + } + if (slot >= LOGFS_BLOCK_FACTOR) { + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + return bix; + } + bofs = be64_to_cpu(rblock[slot]); + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + if (!bofs) { + BUG_ON(data); + return bix; + } + } + return bix; +} + +/** + * logfs_seek_hole - find next hole starting at a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next hole. If the file doesn't contain any further holes, the + * block address next to eof is returned instead. 
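+ *
+ * Example: for a small file whose only data sits in block 0,
+ * logfs_seek_hole(inode, 0) returns 1 - the first hole behind the data.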
+ */ +u64 logfs_seek_hole(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (li->li_flags & LOGFS_IF_EMBEDDED) + return 1; + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 0); + if (bix < I0_BLOCKS) + return bix; + } + +#define SEEK_HOLE_LOOP_WRAPPER(index, blocks, count) do { \ + if (bix < blocks) { \ + if (!li->li_data[index]) \ + return bix; \ + else if (li->li_data[index] & LOGFS_FULLY_POPULATED) \ + bix = blocks; \ + else { \ + bix = seek_holedata_loop(inode, bix, count, 0); \ + if (bix < blocks) \ + return bix; \ + /* LOGFS_FULLY_POPULATED should have been set */\ + WARN_ON_ONCE(bix == blocks); \ + } \ + } \ +} while (0) + SEEK_HOLE_LOOP_WRAPPER(I1_INDEX, I1_BLOCKS, 0); + SEEK_HOLE_LOOP_WRAPPER(I2_INDEX, I2_BLOCKS, 1); + SEEK_HOLE_LOOP_WRAPPER(I3_INDEX, I3_BLOCKS, 2); +#undef SEEK_HOLE_LOOP_WRAPPER + + return bix; +} + +static u64 __logfs_seek_data(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (li->li_flags & LOGFS_IF_EMBEDDED) + return bix; + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 1); + if (bix < I0_BLOCKS) + return bix; + } + +#define SEEK_DATA_LOOP_WRAPPER(index, blocks, count) do { \ + if (bix < blocks) { \ + if (!li->li_data[index]) \ + bix = blocks; \ + else \ + return seek_holedata_loop(inode, bix, count, 1);\ + } \ +} while (0) + SEEK_DATA_LOOP_WRAPPER(I1_INDEX, I1_BLOCKS, 0); + SEEK_DATA_LOOP_WRAPPER(I2_INDEX, I2_BLOCKS, 1); + SEEK_DATA_LOOP_WRAPPER(I3_INDEX, I3_BLOCKS, 2); +#undef SEEK_DATA_LOOP_WRAPPER + + return bix; +} + +/** + * logfs_seek_data - find next data block after a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next data block. If the file doesn't contain any further data + * blocks, the last block in the file is returned instead. 
+ */ +u64 logfs_seek_data(struct inode *inode, u64 bix) +{ + struct super_block *sb = inode->i_sb; + u64 ret, end; + + ret = __logfs_seek_data(inode, bix); + end = i_size_read(inode) >> sb->s_blocksize_bits; + if (ret >= end) + ret = max(bix, end); + return ret; +} + +static int logfs_is_valid_direct(struct logfs_inode *li, pgoff_t index, u64 ofs) +{ + return pure_ofs(li->li_data[index]) == ofs; +} + +static int logfs_is_valid_shadow(struct page *page, u64 ofs) +{ + return PagePrivate(page) && + btree_lookup(&logfs_page_to_tree(page)->old, ofs); +} + +static int __logfs_is_valid_loop(struct inode *inode, u64 bix, int count, + u64 ofs, u64 bofs) +{ + int level, ret; + struct page *page; + + for (level = count + 1; level > 0; level--) { + page = logfs_get_write_page(inode, bix, level); + BUG_ON(!page); + + if (logfs_is_valid_shadow(page, ofs)) { + logfs_put_write_page(inode, page); + return 1; + } + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_write_page(inode, page); + return 0; + } + + bofs = block_get_pointer(page, get_bits(bix, level-1)); + logfs_put_write_page(inode, page); + if (!bofs) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + } + return 0; +} + +static int logfs_is_valid_loop(struct inode *inode, pgoff_t index, + int count, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[I1_INDEX + count]; + + if (!bofs) + return 0; + + if (pure_ofs(bofs) == ofs) + return 1; + + return __logfs_is_valid_loop(inode, index, count, ofs, bofs); +} + +static int __logfs_is_valid_block(struct inode *inode, pgoff_t index, u64 ofs) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (btree_lookup(&li->li_shadow_tree.old, ofs)) { + /* block is still valid on medium */ + return 1; + } + + if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1) + return 0; + + if (li->li_flags & LOGFS_IF_EMBEDDED) + return 0; + + if (index < I0_BLOCKS) + return logfs_is_valid_direct(li, index, ofs); + else if (index < I1_BLOCKS) + return logfs_is_valid_loop(inode, index, 0, ofs); + else if (index < I2_BLOCKS) + return logfs_is_valid_loop(inode, index, 1, ofs); + else if (index < I3_BLOCKS) + return logfs_is_valid_loop(inode, index, 2, ofs); + + BUG(); + return 0; +} + +/** + * logfs_is_valid_block - check whether this block is still valid + * + * @sb - superblock + * @ofs - block physical offset + * @ino - block inode number + * @bix - block index + * @level - block level + * + * Returns 0 if block is invalid, 1 if it is valid. + */ +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + u8 level) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + struct page *page; + int ret, cookie; + + /* Umount closes a segment with free blocks remaining. Those + * blocks are by definition invalid. 
*/ + if (ino == -1) + return 0; + + LOGFS_BUG_ON((u64)(u_long)ino != ino, sb); + + inode = logfs_iget(sb, ino, &cookie); + if (!inode) + return 0; + + ret = __logfs_is_valid_block(inode, bix, ofs); + logfs_iput(inode, cookie); + if (ret) + return ret; + + /* Block may sit in the shadow of a dirty ifile block, so check again + * in the ifile, with properly forged parameters */ + ret = __logfs_is_valid_block(super->s_master_inode, ino, ofs); + if (ret) + return ret; + + /* Another check - the leaf blocks are usually ignored */ + page = logfs_get_write_page(super->s_master_inode, ino, 0); + if (!page) + return 0; + ret = logfs_is_valid_shadow(page, ofs); + logfs_put_write_page(inode, page); + return ret; +} + +int logfs_readpage_nolock(struct page *page) +{ + struct inode *inode = page->mapping->host; + int ret = -EIO; + + ret = logfs_read_block(inode, page); + + if (ret) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + flush_dcache_page(page); + + return ret; +} + +static int logfs_reserve_bytes(struct inode *inode, int bytes) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + if (!bytes) + return 0; + + if (super->s_free_bytes < bytes + super->s_gc_reserve) + return -ENOSPC; + + return 0; +} + +/* + * Not strictly a reservation, but rather a check that we still have enough + * space to satisfy the write. + */ +static int logfs_reserve_blocks(struct inode *inode, int blocks) +{ + return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE); +} + +static int logfs_write_inode_now(struct inode *inode, long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (!(li->li_flags & LOGFS_IF_DIRTY)) + return 0; + + li->li_flags &= ~LOGFS_IF_DIRTY; + if (inode->i_ino == LOGFS_INO_MASTER) + return logfs_write_anchor(inode); + + return __logfs_write_inode(inode, flags); +} + +static int logfs_write_embedded(struct page *page, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + void *buf, *dst = li->li_data; + + buf = kmap_atomic(page, KM_USER0); + memcpy(dst, buf, i_size_read(inode)); + flush_dcache_page(page); + kunmap_atomic(buf, KM_USER0); + + li->li_flags |= LOGFS_IF_EMBEDDED | LOGFS_IF_DIRTY; + + return 0; +} + +struct write_control { + struct shadow_tree *shadow_tree; + u64 ofs; + long flags; +}; + +static int adj_level(u64 ino, int level) +{ + BUG_ON(level >= LOGFS_MAX_LEVELS); + + if (ino == LOGFS_INO_MASTER) { + /* ifile has seperate areas */ + level += LOGFS_MAX_LEVELS; + } + return level; +} + +static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix, u8 level, + u64 old_ofs) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_shadow *shadow; + + shadow = mempool_alloc(super->s_shadow_pool, GFP_KERNEL); + shadow->ino = inode->i_ino; + shadow->bix = bix; + shadow->level = adj_level(inode->i_ino, level); + shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED; + return shadow; +} + +static void free_shadow(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + mempool_free(shadow, super->s_block_pool); +} + +static void shadow_tree_merge(struct shadow_tree *target, + struct shadow_tree *victim) +{ + btree_merge(&target->new, &victim->new); + btree_merge(&target->old, &victim->old); +} + +static void add_shadow_tree_to_page(struct page *page, + struct shadow_tree *shadow_tree) +{ + if (!shadow_tree) + return; + if ((shadow_tree->old.height == 0) && (shadow_tree->new.height == 0)) + return; + + 
shadow_tree_merge(logfs_page_to_tree(page), shadow_tree); +} + +static void fill_shadow_tree(struct shadow_tree *tree, struct page *page, + struct logfs_shadow *shadow) +{ + struct logfs_super *super = logfs_super(page->mapping->host->i_sb); + + if (PagePrivate(page)) { + shadow_tree_merge(tree, logfs_page_to_tree(page)); + list_del(&logfs_block(page)->dirty_list); + ClearPagePrivate(page); + mempool_free(logfs_block(page), super->s_block_pool); + page->private = 0; + } + if (shadow->old_ofs) + btree_insert(&tree->old, shadow->old_ofs, shadow); + else + btree_insert(&tree->new, shadow->new_ofs, shadow); + + super->s_dirty_used_bytes += shadow->new_len; + super->s_dirty_free_bytes += shadow->old_len; +} + +/* + * File is too large for embedded data when called. Move data to first + * block and clear embedded area. + */ +static int logfs_move_embedded(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_shadow *shadow; + void *buf; + int err; + int i; + pgoff_t index = page->index; + + if (!(li->li_flags & LOGFS_IF_EMBEDDED)) + return 0; + + if (logfs_reserve_blocks(inode, 1)) + return -ENOSPC; + + if (index == 0) { + /* No need to write the page twice */ + li->li_data[0] = 0; + } else { + page = logfs_get_read_page(inode, 0, 0); + if (!page) + return -ENOMEM; + + buf = kmap_atomic(page, KM_USER0); + memcpy(buf, li->li_data, LOGFS_EMBEDDED_SIZE); + flush_dcache_page(page); + kunmap_atomic(buf, KM_USER0); + + shadow = alloc_shadow(inode, 0, 0, 0); + err = logfs_segment_write(inode, page, shadow); + logfs_put_read_page(page); + if (err) { + free_shadow(inode, shadow); + return err; + } + fill_shadow_tree(&li->li_shadow_tree, page, shadow); + + li->li_data[0] = shadow->new_ofs | LOGFS_FULLY_POPULATED; + } + + li->li_flags &= ~LOGFS_IF_EMBEDDED; + li->li_flags |= LOGFS_IF_DIRTY; + for (i = 1; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + + return 0; +} + +static int logfs_write_i0(struct inode *inode, struct page *page, + struct write_control *wc) +{ + struct logfs_shadow *shadow; + u64 bix; + u8 level; + int err = 0; + + logfs_unpack_index(page->index, &bix, &level); + if (wc->ofs == 0) + if (logfs_reserve_blocks(inode, 1)) + return -ENOSPC; + + shadow = alloc_shadow(inode, bix, level, wc->ofs); + if (wc->flags & WF_WRITE) + err = logfs_segment_write(inode, page, shadow); + if (wc->flags & WF_DELETE) + logfs_segment_delete(inode, shadow); + if (err) { + free_shadow(inode, shadow); + return err; + } + + fill_shadow_tree(wc->shadow_tree, page, shadow); + wc->ofs = shadow->new_ofs; + if (wc->ofs && ((level == 0) || (page_state(page) == bs_fully_populated))) + wc->ofs |= LOGFS_FULLY_POPULATED; + return 0; +} + +static int logfs_write_direct(struct inode *inode, struct page *page, + long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[page->index], + .shadow_tree = &li->li_shadow_tree, + .flags = flags, + }; + int err; + + err = logfs_write_i0(inode, page, &wc); + if (err) + return err; + + li->li_data[page->index] = wc.ofs; + li->li_flags |= LOGFS_IF_DIRTY; + return 0; +} + +static void logfs_dirty_page(struct inode *inode, struct page *page, long flags) +{ + struct logfs_block *block; + struct logfs_super *super = logfs_super(inode->i_sb); + + /* The assertion below is nicer to debug than random corruption due + * to buggerhead being the default. 
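+	 * ("buggerhead" presumably refers to __set_page_dirty_buffers, the
+	 * buffer_head based fallback used when no set_page_dirty method is set.)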
*/ + BUG_ON(!page_mapping(page)->a_ops->set_page_dirty); + + alloc_block(page); + block = logfs_block(page); + mark_inode_dirty(inode); + set_page_dirty(page); + if (flags & WF_GC) + logfs_dirty_for_gc(inode->i_sb, block); + else + list_move_tail(&block->dirty_list, &super->s_dirty_list); +} + +static int __logfs_write_rec(struct inode *inode, struct page *page, + struct write_control *this_wc, + pgoff_t bix, int target_level, int level) +{ + int ret; + struct page *ipage; + struct write_control child_wc = { + .flags = this_wc->flags, + }; + + ipage = logfs_get_write_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + if (this_wc->ofs) { + ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (ret) + goto out; + } else { + if (PageZero(ipage)) + ClearPageZero(ipage); + else if (!PageUptodate(ipage)) + zero_user_page(ipage, 0, PAGE_SIZE, KM_USER0); + } + child_wc.shadow_tree = logfs_page_to_tree(ipage); + child_wc.ofs = block_get_pointer(ipage, get_bits(bix, level-1)); + + if (level-1 > target_level) + ret = __logfs_write_rec(inode, page, &child_wc, bix, + target_level, level-1); + else + ret = logfs_write_i0(inode, page, &child_wc); + + if (ret) + goto out; + + /* TODO: both operations use kmap_atomic, combine them */ + block_set_pointer(ipage, get_bits(bix, level-1), child_wc.ofs); + if (child_wc.ofs || page_state(ipage) != bs_all_zero) + this_wc->flags |= WF_WRITE; + /* TODO: use write-back caching for ifile as well */ + /* the condition on this_wc->ofs ensures that we won't consume extra + * space for indirect blocks in the future, which we cannot reserve */ + if ((this_wc->flags & WF_SYNC) || !this_wc->ofs) + ret = logfs_write_i0(inode, ipage, this_wc); + else + logfs_dirty_page(inode, ipage, this_wc->flags); +out: + logfs_put_write_page(inode, ipage); + return ret; +} + +static int logfs_write_rec(struct inode *inode, struct page *page, + pgoff_t bix, int count, int target_level, long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[I1_INDEX + count], + .shadow_tree = &li->li_shadow_tree, + .flags = flags, + }; + int ret; + + if (count+1 > target_level) + ret = __logfs_write_rec(inode, page, &wc, bix, target_level, + count+1); + else + ret = logfs_write_i0(inode, page, &wc); + if (!ret) { + if (li->li_data[I1_INDEX + count] != wc.ofs) { + li->li_flags |= LOGFS_IF_DIRTY; + li->li_data[I1_INDEX + count] = wc.ofs; + } + } + return ret; +} + +/* + * We are protected by write lock. Push victims up to superblock level + * and release transaction when appropriate. logfs_write_inode_now(inode) + * will then finish the transaction when writing the master inode to the + * journal. 
+ */ +static void logfs_handle_transaction(struct inode *inode, + struct logfs_transaction *ta) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + if (!ta) + return; + + if (inode->i_ino != LOGFS_INO_MASTER) { + /* just remember the transaction until inode is written */ + BUG_ON(logfs_inode(inode)->li_transaction); + logfs_inode(inode)->li_transaction = ta; + logfs_inode(inode)->li_flags |= LOGFS_IF_DIRTY; + return; + } + + switch (ta->state) { + case CREATE_1: /* fall through */ + case UNLINK_1: + BUG_ON(super->s_victim_ino); + super->s_victim_ino = ta->ino; + break; + case CREATE_2: /* fall through */ + case UNLINK_2: + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + /* transaction ends here - free it */ + kfree(ta); + break; + case CROSS_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + break; + case CROSS_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + kfree(ta); + break; + case TARGET_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + super->s_victim_ino = ta->ino; + break; + case TARGET_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + break; + case TARGET_RENAME_3: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + kfree(ta); + break; + default: + BUG(); + } +} + +static int __logfs_write_buf(struct inode *inode, struct page *page, + struct logfs_transaction *ta, long flags) +{ + u64 size = i_size_read(inode); + pgoff_t index = page->index; + int err; + u64 bix; + u8 level; + + flags |= WF_WRITE | WF_DELETE; + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + logfs_handle_transaction(inode, ta); + + if (size <= LOGFS_EMBEDDED_SIZE) + return logfs_write_embedded(page, inode); + + err = logfs_move_embedded(inode, page); + if (err) + return err; + + if (index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + + logfs_unpack_index(index, &bix, &level); + bix = adjust_bix(bix, level); + if (bix < I1_BLOCKS) + return logfs_write_rec(inode, page, bix, 0, level, flags); + if (bix < I2_BLOCKS) + return logfs_write_rec(inode, page, bix, 1, level, flags); + if (bix < I3_BLOCKS) + return logfs_write_rec(inode, page, bix, 2, level, flags); + + BUG(); + return -EIO; +} + +int logfs_write_buf(struct inode *inode, struct page *page, + struct logfs_transaction *ta, long flags) +{ + struct super_block *sb = inode->i_sb; + int ret; + + logfs_get_wblocks(sb, page, flags & WF_LOCK); + + ret = __logfs_write_buf(inode, page, ta, flags); + BUG_ON(PagePrivate(page)); + if (!ret) + ret = logfs_write_inode_now(inode, flags & ~WF_LOCK); + logfs_put_wblocks(sb, page, flags & WF_LOCK); + return ret; +} + +static int __logfs_delete(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + long flags = WF_DELETE | WF_SYNC; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + if (li->li_flags & LOGFS_IF_EMBEDDED) { + i_size_write(inode, 0); + li->li_flags |= LOGFS_IF_DIRTY; + return 0; + } + + if (page->index < I0_BLOCKS) + return logfs_write_direct(inode, page, flags); + if (page->index < I1_BLOCKS) + 
return logfs_write_rec(inode, page, page->index, 0, 0, flags); + if (page->index < I2_BLOCKS) + return logfs_write_rec(inode, page, page->index, 1, 0, flags); + if (page->index < I3_BLOCKS) + return logfs_write_rec(inode, page, page->index, 2, 0, flags); + return 0; +} + +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree, struct logfs_transaction *ta) +{ + struct super_block *sb = inode->i_sb; + struct page *page; + int ret; + + page = logfs_get_read_page(inode, index, 0); + if (!page) + return -ENOMEM; + + add_shadow_tree_to_page(page, shadow_tree); + logfs_get_wblocks(sb, page, 1); + logfs_handle_transaction(inode, ta); + ret = __logfs_delete(inode, page); + if (!ret) + ret = logfs_write_inode_now(inode, WF_SYNC); + logfs_put_wblocks(sb, page, 1); + + SetPageZero(page); + logfs_put_read_page(page); + + return ret; +} + +/* Rewrite cannot mark the inode dirty but has to write it immediatly. */ +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, int level, + long flags) +{ + struct page *page; + int err; + + level = adjust_level(level); + page = logfs_get_write_page(inode, bix, level); + if (!page) + return -ENOMEM; + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (!err) + err = logfs_write_buf(inode, page, NULL, flags); + logfs_put_write_page(inode, page); + return err; +} + +#define truncate_page(page, offset, km_type) \ + zero_user_page(page, offset, PAGE_SIZE - offset, km_type); + +static int truncate_data_block(struct inode *inode, struct page *page, + u64 ofs, struct logfs_shadow *shadow) +{ + loff_t size = i_size_read(inode); + loff_t pageofs = page->index * LOGFS_BLOCKSIZE; + u64 bix; + u8 level; + int err; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level > 0); + if (size <= pageofs) + return 0; + + BUG_ON(size - pageofs >= PAGE_SIZE); + err = logfs_segment_read(inode, page, ofs, bix, level); + if (err) + return err; + truncate_page(page, size - pageofs, KM_USER0); + return logfs_segment_write(inode, page, shadow); +} + +static int __logfs_truncate_i0(struct inode *inode, struct page *page, + struct write_control *wc) +{ + struct logfs_shadow *shadow; + u64 bix; + u8 level; + int err = 0; + + logfs_unpack_index(page->index, &bix, &level); + shadow = alloc_shadow(inode, bix, level, wc->ofs); + + if (level == 0) + err = truncate_data_block(inode, page, wc->ofs, shadow); + /* Indirect blocks can get removed completely */ + if (err) { + free_shadow(inode, shadow); + return err; + } + + logfs_segment_delete(inode, shadow); + fill_shadow_tree(wc->shadow_tree, page, shadow); + wc->ofs = shadow->new_ofs; + return 0; +} + +static int logfs_truncate_direct(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .shadow_tree = &li->li_shadow_tree, + }; + struct page *page; + int e; + int err; + + for (e = I1_INDEX - 1; e >= 0; e--) { + if (size > (e+1) * LOGFS_BLOCKSIZE) + break; + + wc.ofs = li->li_data[e]; + if (!wc.ofs) + continue; + + page = logfs_get_write_page(inode, e, 0); + if (!page) + return -ENOMEM; +#if 0 /* I believe this is unnecessary */ + err = logfs_segment_read(inode, page, wc.ofs, e, 0); + if (err) { + logfs_put_write_page(page); + return err; + } +#endif + err = __logfs_truncate_i0(inode, page, &wc); + logfs_put_write_page(inode, page); + if (err) + return err; + + li->li_data[e] = wc.ofs; + li->li_flags |= LOGFS_IF_DIRTY; + } + return 0; +} + +/* FIXME: these need to become per-sb once we support different blocksizes */ +static u64 
logfs_factor[] = { + LOGFS_BLOCKSIZE, + LOGFS_I1_SIZE, + LOGFS_I2_SIZE, + LOGFS_I3_SIZE +}; + +static u64 logfs_foo[] = { + 1, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS, +}; + +static u64 logfs_start_index[] = { + I0_BLOCKS, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS +}; + +static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, u8 *level) +{ + logfs_unpack_index(index, bix, level); + if (*bix <= logfs_start_index[*level-1]) + *bix = 0; +} + +static int __logfs_truncate_rec(struct inode *inode, struct page *ipage, + struct write_control *this_wc, u64 size) +{ + int truncate_happened = 0; + int e; + int err = 0; + u64 bix, child_bix; + u8 level; + struct page *page; + struct write_control child_wc = { + .shadow_tree = logfs_page_to_tree(ipage), + }; + + logfs_unpack_raw_index(ipage->index, &bix, &level); + err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (err) + return err; + + for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) { + child_bix = bix + e*logfs_foo[level-1]; + if (size > (e+1) * logfs_factor[level-1]) { + if (truncate_happened) + BUG(); /* FIXME: Write out truncated block */ + return 0; + } + + child_wc.ofs = pure_ofs(block_get_pointer(ipage, e)); + if (!child_wc.ofs) + continue; + + truncate_happened = 1; + page = logfs_get_write_page(inode, child_bix, level-1); + if (!page) + return -ENOMEM; + + if (level > 1) + err = __logfs_truncate_rec(inode, page, &child_wc, size); + else + err = __logfs_truncate_i0(inode, page, &child_wc); + logfs_put_write_page(inode, page); + if (err) + return err; + + block_set_pointer(ipage, e, child_wc.ofs); + } + /* Complete block can get removed if we get here */ + return __logfs_truncate_i0(inode, ipage, this_wc); +} + +static int logfs_truncate_rec(struct inode *inode, u64 size, int level) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[I1_INDEX + level-1], + .shadow_tree = &li->li_shadow_tree, + }; + struct page *page; + int err; + + if (!wc.ofs) + return 0; + + page = logfs_get_write_page(inode, 0, level); + if (!page) + return -ENOMEM; + + err = __logfs_truncate_rec(inode, page, &wc, size); + logfs_put_write_page(inode, page); + if (err) + return err; + + if (li->li_data[I1_INDEX + level-1] != wc.ofs) { + li->li_data[I1_INDEX + level-1] = wc.ofs; + li->li_flags |= LOGFS_IF_DIRTY; + } + return 0; +} + +static int logfs_truncate_embedded(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + void *buf = (void *)li->li_data + size; + size_t len = LOGFS_EMBEDDED_SIZE - size; + + if (size < LOGFS_EMBEDDED_SIZE) + memset(buf, 0, len); + li->li_flags |= LOGFS_IF_DIRTY; + return 0; +} + +static int __logfs_truncate(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + int ret; + + if (li->li_flags & LOGFS_IF_EMBEDDED) + return logfs_truncate_embedded(inode, size); + + if (size >= logfs_factor[3]) + return 0; + ret = logfs_truncate_rec(inode, size, 3); + if (ret) + return ret; + + if (size >= logfs_factor[2]) + return 0; + ret = logfs_truncate_rec(inode, size, 2); + if (ret) + return ret; + + if (size >= logfs_factor[1]) + return 0; + ret = logfs_truncate_rec(inode, size, 1); + if (ret) + return ret; + + ret = logfs_truncate_direct(inode, size); + return ret; +} + +int logfs_truncate(struct inode *inode, u64 size) +{ + struct super_block *sb = inode->i_sb; + int err; + + logfs_get_wblocks(sb, NULL, 1); + err = __logfs_truncate(inode, size); + if (!err) + err = logfs_write_inode_now(inode, 0); + logfs_put_wblocks(sb, NULL, 1); + + 
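+	/* The on-media tree is truncated; vmtruncate() below updates i_size
+	 * and drops the now-stale page cache beyond the new size. */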
if (!err) + err = vmtruncate(inode, size); + + if (!err && size == 0) + logfs_inode(inode)->li_flags |= LOGFS_IF_EMBEDDED; + + return err; +} + +int logfs_inode_read(struct inode *inode, void *buf, size_t n, loff_t bix) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + struct page *page; + void *pagebuf; + + if (pos >= i_size_read(inode)) + return -EOF; + + page = read_cache_page(inode->i_mapping, bix, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + if (PageZero(page)) + return -ENODATA; + + pagebuf = kmap_atomic(page, KM_USER0); + memcpy(buf, pagebuf, n); + kunmap_atomic(pagebuf, KM_USER0); + return 0; +} + +/** + * logfs_inode_write - write inode or dentry objects + * + * @inode: parent inode (ifile or directory) + * @buf: object to write (inode or dentry) + * @n: object size + * @_pos: object number (file position in blocks/objects) + * @flags: write flags + * @lock: 0 if write lock is already taken, 1 otherwise + * @ta: transaction this write is part of or NULL + * @shadow_tree: shadow below this inode + */ +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct logfs_transaction *ta, + struct shadow_tree *shadow_tree) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + int err; + struct page *page; + void *pagebuf; + + BUG_ON(pos & (LOGFS_BLOCKSIZE-1)); + BUG_ON(count > LOGFS_BLOCKSIZE); + page = logfs_get_read_page(inode, bix, 0); + if (!page) + return -ENOMEM; + + pagebuf = kmap_atomic(page, KM_USER0); + memcpy(pagebuf, buf, count); + memset(pagebuf+count, 0, LOGFS_BLOCKSIZE-count); + flush_dcache_page(page); + kunmap_atomic(pagebuf, KM_USER0); + ClearPageZero(page); + add_shadow_tree_to_page(page, shadow_tree); + + if (!(flags & WF_SYNC)) { + logfs_dirty_page(inode, page, flags); + logfs_put_read_page(page); + return 0; + } + /* + * Drop the page lock, but keep a reference on the page until + * logfs_write_buf returns. This allows GC to move this page while + * ensuring the page doesn't get assigned elsewhere under memory + * pressure. + */ + unlock_page(page); + + if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE) + i_size_write(inode, pos + LOGFS_BLOCKSIZE); + + err = logfs_write_buf(inode, page, ta, flags); + page_cache_release(page); + if (err) + return err; + + return 0; +} + +int logfs_init_rw(struct logfs_super *super) +{ + int min_fill = 3 * super->s_no_blocks; + + mutex_init(&super->s_w_mutex); + super->s_block_pool = mempool_create_kzalloc_pool(min_fill, + sizeof(struct logfs_block)); + super->s_shadow_pool = mempool_create_kzalloc_pool(min_fill, + sizeof(struct logfs_shadow)); + return 0; +} + +void logfs_cleanup_rw(struct logfs_super *super) +{ + mempool_destroy(super->s_block_pool); + mempool_destroy(super->s_shadow_pool); +} -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html