Hello, Kent. On Wed, May 09, 2012 at 11:10:48PM -0400, Kent Overstreet wrote: > +#define BKEY_PADDED(key) \ > + union { struct bkey key; uint64_t key ## _pad[8]; } What does the '8' mean? And why does struct bbio use 3 instead? Does this have to be a macro? Can't we have struct bkey_padded (or whatever)? > +struct io { > + /* Used to track sequential IO so it can be skipped */ > + struct hlist_node hash; > + struct list_head lru; > + > + unsigned long jiffies; > + unsigned sequential; > + sector_t last; > +}; I don't think bcache can have struct io. :P > +struct dirty_io { > + struct closure cl; > + struct cached_dev *d; > + struct bio bio; > +}; > + > +struct dirty { > + struct rb_node node; > + BKEY_PADDED(key); > + struct dirty_io *io; > +}; ... > +struct cache { Nor these and so on. > +/* Bkey fields: all units are in sectors */ > + > +#define KEY_FIELD(name, field, offset, size) \ > + BITMASK(name, struct bkey, field, offset, size) > + > +#define PTR_FIELD(name, offset, size) \ > + static inline uint64_t name(const struct bkey *k, unsigned i) \ > + { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \ > + \ > + static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\ > + { \ > + k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \ > + k->ptr[i] |= v << offset; \ > + } > + > +KEY_FIELD(KEY_PTRS, header, 60, 3) > +KEY_FIELD(HEADER_SIZE, header, 58, 2) > +KEY_FIELD(KEY_CSUM, header, 56, 2) > +KEY_FIELD(KEY_PINNED, header, 55, 1) > +KEY_FIELD(KEY_DIRTY, header, 36, 1) > + > +KEY_FIELD(KEY_SIZE, header, 20, 16) > +KEY_FIELD(KEY_DEV, header, 0, 20) > + > +KEY_FIELD(KEY_SECTOR, key, 16, 47) > +KEY_FIELD(KEY_SNAPSHOT, key, 0, 16) > + > +PTR_FIELD(PTR_DEV, 51, 12) > +PTR_FIELD(PTR_OFFSET, 8, 43) > +PTR_FIELD(PTR_GEN, 0, 8) So, apart from the the macros, key is 64bit containing the backend device and extent offset and size with the ptr fields somehow point to cache. Am I understanding it correctly? If right, I'm *tiny* bit apprehensive about using only 43bits for offset. While the block 9 bits means 52bit addressing, the 9 bit block size is now there mostly to work as buffer between memory bitness growth and storage device size growth so that we have 9 bit buffer as storage device reaches ulong addressing limit. Probably those days are far out enough. > +void btree_read_done(struct closure *cl) > +{ > + struct btree *b = container_of(cl, struct btree, io.cl); > + struct bset *i = b->sets[0].data; > + struct btree_iter *iter = b->c->fill_iter; > + const char *err = "bad btree header"; > + BUG_ON(b->nsets || b->written); > + > + bbio_free(b->bio, b->c); > + b->bio = NULL; > + > + mutex_lock(&b->c->fill_lock); > + iter->used = 0; > + > + if (btree_node_io_error(b) || > + !i->seq) > + goto err; > + > + for (; > + b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; > + i = write_block(b)) { > + err = "unsupported bset version"; > + if (i->version > BCACHE_BSET_VERSION) > + goto err; > + > + err = "bad btree header"; > + if (b->written + set_blocks(i, b->c) > btree_blocks(b)) > + goto err; > + > + err = "bad magic"; > + if (i->magic != bset_magic(b->c)) > + goto err; > + > + err = "bad checksum"; > + switch (i->version) { > + case 0: > + if (i->csum != csum_set(i)) > + goto err; > + break; > + case BCACHE_BSET_VERSION: > + if (i->csum != btree_csum_set(b, i)) > + goto err; > + break; > + } > + > + err = "empty set"; > + if (i != b->sets[0].data && !i->keys) > + goto err; > + > + btree_iter_push(iter, i->start, end(i)); > + > + b->written += set_blocks(i, b->c); > + } > + > + err = "corrupted btree"; > + for (i = write_block(b); > + index(i, b) < btree_blocks(b); > + i = ((void *) i) + block_bytes(b->c)) > + if (i->seq == b->sets[0].data->seq) > + goto err; > + > + btree_sort_and_fix_extents(b, iter); > + > + i = b->sets[0].data; > + err = "short btree key"; > + if (b->sets[0].size && > + bkey_cmp(&b->key, &b->sets[0].end) < 0) > + goto err; > + > + if (b->written < btree_blocks(b)) > + bset_init_next(b); > + > + if (0) { > +err: set_btree_node_io_error(b); > + cache_set_error(b->c, "%s at bucket %lu, block %zu, %u keys", > + err, PTR_BUCKET_NR(b->c, &b->key, 0), > + index(i, b), i->keys); > + } Please don't do that. Just define out: label, move error specific path to the end of the function and jump to out at the end of that. > + > + mutex_unlock(&b->c->fill_lock); > + > + spin_lock(&b->c->btree_read_time_lock); > + time_stats_update(&b->c->btree_read_time, b->io_start_time); > + spin_unlock(&b->c->btree_read_time_lock); > + > + smp_wmb(); /* read_done is our write lock */ > + set_btree_node_read_done(b); > + > + closure_return(cl); > +} > + > +static void btree_read_resubmit(struct closure *cl) > +{ > + struct btree *b = container_of(cl, struct btree, io.cl); > + > + submit_bbio_split(b->bio, b->c, &b->key, 0); > + continue_at(&b->io.cl, btree_read_done, system_wq); > +} I suppose submit_bbio_split() can't fail here somehow unlike from btree_read() path? If so, please add a comment to explain and maybe WARN_ON_ONCE() on failure. Subtlety to comment ratio is *way* off. > +static struct btree *mca_bucket_alloc(struct cache_set *c, > + struct bkey *k, gfp_t gfp) > +{ > + struct btree *b = kzalloc(sizeof(struct btree), gfp); > + if (!b) > + return NULL; > + > + init_rwsem(&b->lock); > + INIT_LIST_HEAD(&b->list); > + INIT_DELAYED_WORK(&b->work, btree_write_work); > + b->c = c; > + closure_init_unlocked(&b->io); > + > + mca_data_alloc(b, k, gfp); > + return b->sets[0].data ? b : NULL; > +} mca_data_alloc() failure path seems like a resource leak but it isn't because mca_data_alloc() puts it in free list. Is the extra level of caching necessary? How is it different from sl?b allocator cache? And either way, comments please. > +static int mca_reap(struct btree *b, struct closure *cl) > +{ > + lockdep_assert_held(&b->c->bucket_lock); > + > + if (!down_write_trylock(&b->lock)) > + return -1; > + > + BUG_ON(btree_node_dirty(b) && !b->sets[0].data); > + > + if (cl && btree_node_dirty(b)) > + btree_write(b, true, NULL); > + > + if (cl) > + closure_wait_event_async(&b->io.wait, cl, > + atomic_read(&b->io.cl.remaining) == -1); > + > + if (btree_node_dirty(b) || > + atomic_read(&b->io.cl.remaining) != -1 || > + work_pending(&b->work.work)) { > + rw_unlock(true, b); > + return -EAGAIN; > + } > + > + return 0; > +} Mixing -1 and -EAGAIN returns usually isn't a good idea. > +static struct btree *alloc_bucket(struct cache_set *c, struct bkey *k, > + int level, struct closure *cl) > +{ > + struct btree *b, *i; > + unsigned page_order = ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); > + > + lockdep_assert_held(&c->bucket_lock); > +retry: > + if (find_bucket(c, k)) > + return NULL; > + > + /* btree_free() doesn't free memory; it sticks the node on the end of > + * the list. Check if there's any freed nodes there: > + */ > + list_for_each_entry(b, &c->btree_cache_freeable, list) > + if (page_order <= b->page_order && > + !b->key.ptr[0] && > + !mca_reap(b, NULL)) > + goto out; > + > + /* We never free struct btree itself, just the memory that holds the on > + * disk node. Check the freed list before allocating a new one: > + */ > + list_for_each_entry(b, &c->btree_cache_freed, list) > + if (!mca_reap(b, NULL)) { > + mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); > + if (!b->sets[0].data) { > + rw_unlock(true, b); > + goto err; > + } else > + goto out; > + } > + > + b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); > + if (!b) > + goto err; > + > + BUG_ON(!down_write_trylock(&b->lock)); > +out: > + BUG_ON(!closure_is_unlocked(&b->io.cl)); > + > + bkey_copy(&b->key, k); > + list_move(&b->list, &c->btree_cache); > + hlist_del_init_rcu(&b->hash); > + hlist_add_head_rcu(&b->hash, hash_bucket(c, k)); > + lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); > + > + b->flags = 0; > + b->level = level; > + b->written = 0; > + b->nsets = 0; > + for (int i = 0; i < MAX_BSETS; i++) > + b->sets[i].size = 0; > + for (int i = 1; i < MAX_BSETS; i++) > + b->sets[i].data = NULL; Why separate loops? > + > + return b; > +err: > + if (current->bio_list) > + return ERR_PTR(-EAGAIN); What does this test do? Thanks. -- tejun -- To unsubscribe from this list: send the line "unsubscribe linux-bcache" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html