On Tue, Oct 29, 2024 at 01:45:01PM -0700, Catherine Hoang wrote: > This patch removes the use of buffer heads from the quota read and > write paths. To do so, we implement a new buffer cache using an > rhashtable. Each buffer stores data from an associated block, and > can be read or written to as needed. > > Ultimately, we want to completely remove buffer heads from ext2. > This patch serves as an example than can be applied to other parts > of the filesystem. > > Signed-off-by: Catherine Hoang <catherine.hoang@xxxxxxxxxx> Cool, thanks for sending this out publicly. :) > --- > fs/ext2/Makefile | 2 +- > fs/ext2/cache.c | 195 +++++++++++++++++++++++++++++++++++++++++++++++ > fs/ext2/ext2.h | 30 ++++++++ > fs/ext2/inode.c | 20 +++++ > fs/ext2/super.c | 62 ++++++++------- > 5 files changed, 281 insertions(+), 28 deletions(-) > create mode 100644 fs/ext2/cache.c > > diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile > index 8860948ef9ca..e8b38243058f 100644 > --- a/fs/ext2/Makefile > +++ b/fs/ext2/Makefile > @@ -5,7 +5,7 @@ > > obj-$(CONFIG_EXT2_FS) += ext2.o > > -ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ > +ext2-y := balloc.o cache.o dir.o file.o ialloc.o inode.o \ > ioctl.o namei.o super.o symlink.o trace.o > > # For tracepoints to include our trace.h from tracepoint infrastructure > diff --git a/fs/ext2/cache.c b/fs/ext2/cache.c > new file mode 100644 > index 000000000000..c58416392c52 > --- /dev/null > +++ b/fs/ext2/cache.c > @@ -0,0 +1,195 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Copyright (c) 2024 Oracle. All rights reserved. > + */ > + > +#include "ext2.h" > +#include <linux/bio.h> > +#include <linux/blkdev.h> > +#include <linux/rhashtable.h> > +#include <linux/mm.h> > +#include <linux/types.h> > + > +static const struct rhashtable_params buffer_cache_params = { > + .key_len = sizeof(sector_t), > + .key_offset = offsetof(struct ext2_buffer, b_block), > + .head_offset = offsetof(struct ext2_buffer, b_rhash), > + .automatic_shrinking = true, > +}; > + > +static struct ext2_buffer *insert_buffer_cache(struct super_block *sb, struct ext2_buffer *new_buf) > +{ > + struct ext2_sb_info *sbi = EXT2_SB(sb); > + struct rhashtable *buffer_cache = &sbi->buffer_cache; > + struct ext2_buffer *old_buf; > + > + spin_lock(&sbi->buffer_cache_lock); > + old_buf = rhashtable_lookup_get_insert_fast(buffer_cache, > + &new_buf->b_rhash, buffer_cache_params); > + spin_unlock(&sbi->buffer_cache_lock); > + > + if (old_buf) > + return old_buf; > + > + return new_buf; Doesn't rhashtable_lookup_get_insert_fast return whichever buffer is in the rhashtable? So you can just do "return old_buf" here? > +} > + > +static void buf_write_end_io(struct bio *bio) > +{ > + struct ext2_buffer *buf = bio->bi_private; > + > + clear_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags); If the bio fails, should we leave the dirty bit set? The IO error could be temporary, and a future syncfs ought to retry the write. Also I wonder if we ought to have a way to report to sync_buffers that one of the writes failed so that it can return EIO for syncfs? Perhaps an atomic_t write failure counter in the buffer cache struct (see below) that we could bump from buf_write_end_io every time there's a failure? Then sync_buffers could compare the write failure counter before issuing IOs and after they've all completed? > + bio_put(bio); > +} > + > +static int submit_buffer_read(struct super_block *sb, struct ext2_buffer *buf) > +{ > + struct bio_vec bio_vec; > + struct bio bio; > + sector_t sector = buf->b_block * (sb->s_blocksize >> 9); > + > + bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ); > + bio.bi_iter.bi_sector = sector; > + > + __bio_add_page(&bio, buf->b_page, buf->b_size, 0); > + > + return submit_bio_wait(&bio); > +} > + > +static void submit_buffer_write(struct super_block *sb, struct ext2_buffer *buf) > +{ > + struct bio *bio; > + sector_t sector = buf->b_block * (sb->s_blocksize >> 9); > + > + bio = bio_alloc(sb->s_bdev, 1, REQ_OP_WRITE, GFP_KERNEL); > + > + bio->bi_iter.bi_sector = sector; > + bio->bi_end_io = buf_write_end_io; > + bio->bi_private = buf; > + > + __bio_add_page(bio, buf->b_page, buf->b_size, 0); > + > + submit_bio(bio); > +} > + > +int sync_buffers(struct super_block *sb) > +{ > + struct ext2_sb_info *sbi = EXT2_SB(sb); > + struct rhashtable *buffer_cache = &sbi->buffer_cache; > + struct rhashtable_iter iter; > + struct ext2_buffer *buf; > + struct blk_plug plug; > + > + blk_start_plug(&plug); > + rhashtable_walk_enter(buffer_cache, &iter); > + rhashtable_walk_start(&iter); > + while ((buf = rhashtable_walk_next(&iter)) != NULL) { > + if (IS_ERR(buf)) > + continue; > + if (test_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags)) { > + submit_buffer_write(sb, buf); > + } > + } > + rhashtable_walk_stop(&iter); > + rhashtable_walk_exit(&iter); > + blk_finish_plug(&plug); Hum. I think this needs to wait for the writes to complete so that it can report any IO errors. What do you think of adding a completion to ext2_buffer so that the end_io function can complete() on it, and this function can wait for each buffer? (I think this is getting closer and closer to the delwri list code for xfs_buf...) > + > + return 0; > +} > + > +static struct ext2_buffer *lookup_buffer_cache(struct super_block *sb, sector_t block) > +{ > + struct ext2_sb_info *sbi = EXT2_SB(sb); > + struct rhashtable *buffer_cache = &sbi->buffer_cache; > + struct ext2_buffer *found = NULL; > + > + found = rhashtable_lookup_fast(buffer_cache, &block, buffer_cache_params); > + > + return found; Doesn't this need to take the rcu read lock before the lookup so that the rhashtable structures (and the ext2_buffer) cannot be freed while we try to grab a refcount on the buffer? I think it also needs to refcount_inc_not_zero before dropping that read lock: rcu_read_lock(); found = rhashtable_lookup_fast(..); if (!found || !refcount_inc_not_zero(&found->b_refcount)) { rcu_read_unlock(); return -ENOENT; } rcu_read_unlock(); return found; > +} > + > +static int init_buffer(struct super_block *sb, sector_t block, struct ext2_buffer **buf_ptr) > +{ > + struct ext2_buffer *buf; > + > + buf = kmalloc(sizeof(struct ext2_buffer), GFP_KERNEL); > + if (!buf) > + return -ENOMEM; > + > + buf->b_block = block; > + buf->b_size = sb->s_blocksize; > + buf->b_flags = 0; > + > + mutex_init(&buf->b_lock); > + refcount_set(&buf->b_refcount, 1); > + > + buf->b_page = alloc_page(GFP_KERNEL); For s_blocksize < PAGE_SIZE filesystems, you could make this more memory efficient by doing slab allocations instead of a full page. > + if (!buf->b_page) > + return -ENOMEM; Needs to free buf. > + > + buf->b_data = page_address(buf->b_page); > + > + *buf_ptr = buf; > + > + return 0; > +} > + > +void put_buffer(struct ext2_buffer *buf) > +{ > + refcount_dec(&buf->b_refcount); > + mutex_unlock(&buf->b_lock); Buffers can get freed after the refcount drops, so you need to drop the mutex before dropping the ref. I almost wonder if this function should take a (struct ext2_buffer **) so that you can null out the caller's pointer to avoid accidental UAF, but that might be too training wheels for the kernel. > +} > + > +struct ext2_buffer *get_buffer(struct super_block *sb, sector_t block, bool need_uptodate) > +{ > + int err; > + struct ext2_buffer *buf; > + struct ext2_buffer *new_buf; > + > + buf = lookup_buffer_cache(sb, block); > + > + if (!buf) { > + err = init_buffer(sb, block, &new_buf); If you made init_buffer return either a (struct ext2_buffer *) or NULL, then you could remove the third parameter and the callsite becomes: new_buf = init_buffer(sb, block); if (!new_buf) return ERR_PTR(-ENOMEM); > + if (err) > + return ERR_PTR(err); > + > + if (need_uptodate) { > + err = submit_buffer_read(sb, new_buf); > + if (err) > + return ERR_PTR(err); If you passed need_uptodate to init_buffer, then you could pass GFP_ZERO to alloc_page() in the !need_uptodate case, and the buffer would be clean (i.e. not contain any stale memory contents) if we're initializing a new block and don't care what's on disk. Also I think you need a destroy_buffer call if the read fails. > + } > + > + buf = insert_buffer_cache(sb, new_buf); > + if (IS_ERR(buf)) > + kfree(new_buf); I'm confused here -- insert_buffer_cache returns either a buffer that has been added to the cache since the lookup_buffer_cache call, or it adds new_buf and returns it. Neither of those pointers are an IS_ERR, so if buf != new_buf (aka we lost the race to add the buffer), we'll end up leaking new_buf. Also, new_buf has resources allocated to it, don't you need to call destroy_buffer here? > + } > + > + mutex_lock(&buf->b_lock); > + refcount_inc(&buf->b_refcount); > + > + return buf; > +} Some people dislike trivial helpers, but I think the callsites would read better if you did: struct ext2_buffer * __get_buffer(struct ext2_buffer_cache *bufs, sector_t block, bool need_uptodate); static inline struct ext2_buffer * get_buffer(struct ext2_buffer_cache *bufs, sector_t bno) { return __get_buffer(bufs, bno, false); } static inline struct ext2_buffer * read_buffer(struct ext2_buffer_cache *bufs, sector_t bno) { return __get_buffer(bufs, bno, true); } It's much easier for me to distinguish between reading a buffer so that we can update its contents vs. getting a buffer so that we can initialize a new block if the words are right there in the function name instead of a 0/1 argument: if (offset || tocopy != EXT2_BLOCK_SIZE(sb)) buf = get_buffer(sb, bno, 1); else buf = get_buffer(sb, bno, 0); vs: /* * rmw a partial buffer or skip the read if we're doing the * entire block. */ if (offset || tocopy != EXT2_BLOCK_SIZE(sb)) buf = read_buffer(sb, bno); else buf = get_buffer(sb, bno); > +void buffer_set_dirty(struct ext2_buffer *buf) > +{ > + set_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags); > +} There ought to be a way to clear the dirty bit on a buffer if you're freeing it... though AFAICT we never actually free blocks from a quota file so I guess you don't need it yet. > + > +int init_buffer_cache(struct rhashtable *buffer_cache) > +{ > + return rhashtable_init(buffer_cache, &buffer_cache_params); Shouldn't this be in charge of initializing the spinlock? > +} > + > +static void destroy_buffer(void *ptr, void *arg) > +{ > + struct ext2_buffer *buf = ptr; > + > + WARN_ON(test_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags)); > + __free_page(buf->b_page); > + kfree(buf); > +} > + > +void destroy_buffer_cache(struct rhashtable *buffer_cache) > +{ > + rhashtable_free_and_destroy(buffer_cache, destroy_buffer, NULL); > +} Just as a note to everyone else: there needs to be a shrinker to clear out !dirty buffers during memory reclaim, and (most probably) a means to initiate a background writeout of dirty buffers when memory starts getting low. Each of those is complex enough to warrant separate patches. > diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h > index f38bdd46e4f7..ce0bb03527e0 100644 > --- a/fs/ext2/ext2.h > +++ b/fs/ext2/ext2.h > @@ -18,6 +18,7 @@ > #include <linux/rbtree.h> > #include <linux/mm.h> > #include <linux/highmem.h> > +#include <linux/rhashtable.h> > > /* XXX Here for now... not interested in restructing headers JUST now */ > > @@ -116,6 +117,8 @@ struct ext2_sb_info { > struct mb_cache *s_ea_block_cache; > struct dax_device *s_daxdev; > u64 s_dax_part_off; > + struct rhashtable buffer_cache; > + spinlock_t buffer_cache_lock; These ought to be in a struct ext2_buffer_cache, so that you can pass a (struct ext2_buffer_cache *) to the {init,destroy}_buffer_cache functions. That improves type safety, because the compiler can check for us that someone doesn't accidentally pass some other rhashtable into destroy_buffer_cache. > }; > > static inline spinlock_t * > @@ -683,6 +686,24 @@ struct ext2_inode_info { > */ > #define EXT2_STATE_NEW 0x00000001 /* inode is newly created */ > > +/* > + * ext2 buffer > + */ > +struct ext2_buffer { > + sector_t b_block; > + struct rhash_head b_rhash; > + struct page *b_page; > + size_t b_size; > + char *b_data; > + unsigned long b_flags; > + refcount_t b_refcount; > + struct mutex b_lock; > +}; > + > +/* > + * Buffer flags > + */ > + #define EXT2_BUF_DIRTY_BIT 0 > > /* > * Function prototypes > @@ -716,6 +737,14 @@ extern int ext2_should_retry_alloc(struct super_block *sb, int *retries); > extern void ext2_init_block_alloc_info(struct inode *); > extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_window_node *rsv); > > +/* cache.c */ > +extern int init_buffer_cache(struct rhashtable *); > +extern void destroy_buffer_cache(struct rhashtable *buffer_cache); > +extern int sync_buffers(struct super_block *); > +extern struct ext2_buffer *get_buffer(struct super_block *, sector_t, bool); > +extern void put_buffer(struct ext2_buffer *); > +extern void buffer_set_dirty(struct ext2_buffer *); > + > /* dir.c */ > int ext2_add_link(struct dentry *, struct inode *); > int ext2_inode_by_name(struct inode *dir, > @@ -741,6 +770,7 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *); > extern void ext2_evict_inode(struct inode *); > void ext2_write_failed(struct address_space *mapping, loff_t to); > extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); > +extern int ext2_get_block_bno(struct inode *, sector_t, int, u32 *, bool *); > extern int ext2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); > extern int ext2_getattr (struct mnt_idmap *, const struct path *, > struct kstat *, u32, unsigned int); > diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c > index 0caa1650cee8..7e7e6a5916c4 100644 > --- a/fs/ext2/inode.c > +++ b/fs/ext2/inode.c > @@ -803,6 +803,26 @@ int ext2_get_block(struct inode *inode, sector_t iblock, > > } > > +int ext2_get_block_bno(struct inode *inode, sector_t iblock, > + int create, u32 *bno, bool *mapped) > +{ > + struct super_block *sb = inode->i_sb; > + struct buffer_head tmp_bh; This probably ought to be switched to iomap if the goal is to get rid of buffer heads. But as I mutter somewhere else, maybe the quota code should just copy to and from the quota file pagecache and this effort target the other metadata of ext2. ;) > + int err; > + > + tmp_bh.b_state = 0; > + tmp_bh.b_size = sb->s_blocksize; > + > + err = ext2_get_block(inode, iblock, &tmp_bh, 0); > + if (err) > + return err; > + > + *mapped = buffer_mapped(&tmp_bh); > + *bno = tmp_bh.b_blocknr; > + > + return 0; > +} > > static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, > unsigned flags, struct iomap *iomap, struct iomap *srcmap) > { > diff --git a/fs/ext2/super.c b/fs/ext2/super.c > index 37f7ce56adce..11d88882ad24 100644 > --- a/fs/ext2/super.c > +++ b/fs/ext2/super.c > @@ -152,6 +152,8 @@ static void ext2_put_super (struct super_block * sb) > ext2_xattr_destroy_cache(sbi->s_ea_block_cache); > sbi->s_ea_block_cache = NULL; > > + destroy_buffer_cache(&sbi->buffer_cache); > + > if (!sb_rdonly(sb)) { > struct ext2_super_block *es = sbi->s_es; > > @@ -835,6 +837,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) > sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, > NULL, NULL); > > + spin_lock_init(&sbi->buffer_cache_lock); > + ret = init_buffer_cache(&sbi->buffer_cache); > + if (ret) { > + ext2_msg(sb, KERN_ERR, "error: unable to create buffer cache"); > + goto failed_sbi; > + } If the subsequent initialization fails, shouldn't ext2_fill_super call destroy_buffer_cache to free the buffer cache object? > + > spin_lock_init(&sbi->s_lock); > ret = -EINVAL; > > @@ -1278,6 +1287,8 @@ static int ext2_sync_fs(struct super_block *sb, int wait) > */ > dquot_writeback_dquots(sb, -1); > > + sync_buffers(sb); Assuming that sync_buffers (or really, sync_buffer_cache() called on the ext2_buffer_cache) starts returning io errors, any error code should be returned here: int ret = 0, ret2; ret2 = dquot_writeback_dquots(sb, -1); if (!ret && ret2) ret = ret2; ... ret2 = sync_buffer_cache(&sb->buffer_cache); if (!ret && ret2) ret = ret2; ... return ret; > + > spin_lock(&sbi->s_lock); > if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { > ext2_debug("setting valid to 0\n"); > @@ -1491,9 +1502,10 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, > int offset = off & (sb->s_blocksize - 1); > int tocopy; > size_t toread; > - struct buffer_head tmp_bh; > - struct buffer_head *bh; > loff_t i_size = i_size_read(inode); > + struct ext2_buffer *buf; > + u32 bno; > + bool mapped; > > if (off > i_size) > return 0; > @@ -1503,20 +1515,19 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, > while (toread > 0) { > tocopy = min_t(size_t, sb->s_blocksize - offset, toread); > > - tmp_bh.b_state = 0; > - tmp_bh.b_size = sb->s_blocksize; > - err = ext2_get_block(inode, blk, &tmp_bh, 0); > + err = ext2_get_block_bno(inode, blk, 0, &bno, &mapped); > if (err < 0) > return err; > - if (!buffer_mapped(&tmp_bh)) /* A hole? */ > + if (!mapped) /* A hole? */ > memset(data, 0, tocopy); > else { > - bh = sb_bread(sb, tmp_bh.b_blocknr); > - if (!bh) > - return -EIO; > - memcpy(data, bh->b_data+offset, tocopy); > - brelse(bh); > + buf = get_buffer(sb, bno, 1); > + if (IS_ERR(buf)) > + return PTR_ERR(buf); > + memcpy(data, buf->b_data+offset, tocopy); > + put_buffer(buf); I recognize that this is a RFC proof of concept, but I wonder why we don't just copy the dquot information to and from the pagecache? And use the ext2_buffer for other non-file metadata things (e.g. bitmaps, indirect blocks, inodes, group descriptors)? > } > + > offset = 0; > toread -= tocopy; > data += tocopy; > @@ -1535,32 +1546,29 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, > int offset = off & (sb->s_blocksize - 1); > int tocopy; > size_t towrite = len; > - struct buffer_head tmp_bh; > - struct buffer_head *bh; > + struct ext2_buffer *buf; > + u32 bno; > + bool mapped; > > while (towrite > 0) { > tocopy = min_t(size_t, sb->s_blocksize - offset, towrite); > > - tmp_bh.b_state = 0; > - tmp_bh.b_size = sb->s_blocksize; > - err = ext2_get_block(inode, blk, &tmp_bh, 1); > + err = ext2_get_block_bno(inode, blk, 1, &bno, &mapped); > if (err < 0) > goto out; > + > if (offset || tocopy != EXT2_BLOCK_SIZE(sb)) > - bh = sb_bread(sb, tmp_bh.b_blocknr); > + buf = get_buffer(sb, bno, 1); > else > - bh = sb_getblk(sb, tmp_bh.b_blocknr); > - if (unlikely(!bh)) { > - err = -EIO; > + buf = get_buffer(sb, bno, 0); > + if (IS_ERR(buf)) { > + err = PTR_ERR(buf); > goto out; > } > - lock_buffer(bh); > - memcpy(bh->b_data+offset, data, tocopy); > - flush_dcache_page(bh->b_page); > - set_buffer_uptodate(bh); > - mark_buffer_dirty(bh); > - unlock_buffer(bh); > - brelse(bh); > + memcpy(buf->b_data+offset, data, tocopy); > + buffer_set_dirty(buf); I also wonder if this could be lifted a buffer_write() helper: void buffer_write(struct ext2_buffer *buf, void *data, unsigned int count, unsigned int offset) { WARN_ON(offset + count > buf->b_size); memcpy(buf->b_data + offset, data, count); buffer_set_dirty(buf); } buffer_write(buf, data, offset, tocopy); > + put_buffer(buf); > + > offset = 0; > towrite -= tocopy; > data += tocopy; > -- > 2.43.0 > >