On Mon, Mar 04, 2024 at 08:10:30PM +0100, Andrey Albershteyn wrote: > In the current implementation fs-verity expects filesystem to > provide PAGEs filled with Merkle tree blocks. Then, when fs-verity > is done with processing the blocks, reference to PAGE is freed. This > doesn't fit well with the way XFS manages its memory. > > To allow XFS integrate fs-verity this patch adds ability to > fs-verity verification code to take Merkle tree blocks instead of > PAGE reference. This way ext4, f2fs, and btrfs are still able to > pass PAGE references and XFS can pass reference to Merkle tree > blocks stored in XFS's buffer infrastructure. > > Another addition is invalidation function which tells fs-verity to > mark part of Merkle tree as not verified. This function is used > by filesystem to tell fs-verity to invalidate block which was > evicted from memory. > > Depending on Merkle tree block size fs-verity is using either bitmap > or PG_checked flag to track "verified" status of the blocks. With a > Merkle tree block caching (XFS) there is no PAGE to flag it as > verified. fs-verity always uses bitmap to track verified blocks for > filesystems which use block caching. > > Further this patch allows filesystem to make additional processing > on verified pages via fsverity_drop_block() instead of just dropping > a reference. This will be used by XFS for internal buffer cache > manipulation in further patches. The btrfs, ext4, and f2fs just drop > the reference. > > Signed-off-by: Andrey Albershteyn <aalbersh@xxxxxxxxxx> > --- > fs/verity/fsverity_private.h | 8 +++ > fs/verity/open.c | 8 ++- > fs/verity/read_metadata.c | 64 +++++++++++------- > fs/verity/verify.c | 125 +++++++++++++++++++++++++++-------- > include/linux/fsverity.h | 65 ++++++++++++++++++ > 5 files changed, 217 insertions(+), 53 deletions(-) > > diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h > index b3506f56e180..dad33e6ff0d6 100644 > --- a/fs/verity/fsverity_private.h > +++ b/fs/verity/fsverity_private.h > @@ -154,4 +154,12 @@ static inline void fsverity_init_signature(void) > > void __init fsverity_init_workqueue(void); > > +/* > + * Drop 'block' obtained with ->read_merkle_tree_block(). Calls out back to > + * filesystem if ->drop_block() is set, otherwise, drop the reference in the > + * block->context. > + */ > +void fsverity_drop_block(struct inode *inode, > + struct fsverity_blockbuf *block); > + > #endif /* _FSVERITY_PRIVATE_H */ > diff --git a/fs/verity/open.c b/fs/verity/open.c > index fdeb95eca3af..6e6922b4b014 100644 > --- a/fs/verity/open.c > +++ b/fs/verity/open.c > @@ -213,7 +213,13 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, > if (err) > goto fail; > > - if (vi->tree_params.block_size != PAGE_SIZE) { > + /* > + * If fs passes Merkle tree blocks to fs-verity (e.g. XFS), then > + * fs-verity should use hash_block_verified bitmap as there's no page > + * to mark it with PG_checked. > + */ > + if (vi->tree_params.block_size != PAGE_SIZE || > + inode->i_sb->s_vop->read_merkle_tree_block) { > /* > * When the Merkle tree block size and page size differ, we use > * a bitmap to keep track of which hash blocks have been > diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c > index f58432772d9e..5da40b5a81af 100644 > --- a/fs/verity/read_metadata.c > +++ b/fs/verity/read_metadata.c > @@ -18,50 +18,68 @@ static int fsverity_read_merkle_tree(struct inode *inode, > { > const struct fsverity_operations *vops = inode->i_sb->s_vop; > u64 end_offset; > - unsigned int offs_in_page; > + unsigned int offs_in_block; > pgoff_t index, last_index; > int retval = 0; > int err = 0; > + const unsigned int block_size = vi->tree_params.block_size; > + const u8 log_blocksize = vi->tree_params.log_blocksize; > > end_offset = min(offset + length, vi->tree_params.tree_size); > if (offset >= end_offset) > return 0; > - offs_in_page = offset_in_page(offset); > - last_index = (end_offset - 1) >> PAGE_SHIFT; > + offs_in_block = offset & (block_size - 1); > + last_index = (end_offset - 1) >> log_blocksize; > > /* > - * Iterate through each Merkle tree page in the requested range and copy > - * the requested portion to userspace. Note that the Merkle tree block > - * size isn't important here, as we are returning a byte stream; i.e., > - * we can just work with pages even if the tree block size != PAGE_SIZE. > + * Iterate through each Merkle tree block in the requested range and > + * copy the requested portion to userspace. Note that we are returning > + * a byte stream. > */ > - for (index = offset >> PAGE_SHIFT; index <= last_index; index++) { > + for (index = offset >> log_blocksize; index <= last_index; index++) { > unsigned long num_ra_pages = > min_t(unsigned long, last_index - index + 1, > inode->i_sb->s_bdi->io_pages); > unsigned int bytes_to_copy = min_t(u64, end_offset - offset, > - PAGE_SIZE - offs_in_page); > - struct page *page; > - const void *virt; > + block_size - offs_in_block); > + struct fsverity_blockbuf block = { > + .size = block_size, > + }; > > - page = vops->read_merkle_tree_page(inode, index, num_ra_pages); > - if (IS_ERR(page)) { > - err = PTR_ERR(page); > + if (!vops->read_merkle_tree_block) { > + unsigned int blocks_per_page = > + vi->tree_params.blocks_per_page; > + unsigned long page_idx = > + round_down(index, blocks_per_page); > + struct page *page = vops->read_merkle_tree_page(inode, > + page_idx, num_ra_pages); > + > + if (IS_ERR(page)) { > + err = PTR_ERR(page); > + } else { > + block.kaddr = kmap_local_page(page) + > + ((index - page_idx) << log_blocksize); > + block.context = page; > + } > + } else { > + err = vops->read_merkle_tree_block(inode, > + index << log_blocksize, > + &block, log_blocksize, num_ra_pages); > + } > + > + if (err) { > fsverity_err(inode, > - "Error %d reading Merkle tree page %lu", > - err, index); > + "Error %d reading Merkle tree block %lu", > + err, index << log_blocksize); > break; > } > > - virt = kmap_local_page(page); > - if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) { > - kunmap_local(virt); > - put_page(page); > + if (copy_to_user(buf, block.kaddr + offs_in_block, bytes_to_copy)) { > + fsverity_drop_block(inode, &block); > err = -EFAULT; > break; > } > - kunmap_local(virt); > - put_page(page); > + fsverity_drop_block(inode, &block); > > retval += bytes_to_copy; > buf += bytes_to_copy; > @@ -72,7 +90,7 @@ static int fsverity_read_merkle_tree(struct inode *inode, > break; > } > cond_resched(); > - offs_in_page = 0; > + offs_in_block = 0; > } > return retval ? retval : err; > } > diff --git a/fs/verity/verify.c b/fs/verity/verify.c > index 4fcad0825a12..de71911d400c 100644 > --- a/fs/verity/verify.c > +++ b/fs/verity/verify.c > @@ -13,14 +13,17 @@ > static struct workqueue_struct *fsverity_read_workqueue; > > /* > - * Returns true if the hash block with index @hblock_idx in the tree, located in > - * @hpage, has already been verified. > + * Returns true if the hash block with index @hblock_idx in the tree has > + * already been verified. > */ > -static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, > +static bool is_hash_block_verified(struct inode *inode, > + struct fsverity_blockbuf *block, > unsigned long hblock_idx) > { > unsigned int blocks_per_page; > unsigned int i; > + struct fsverity_info *vi = inode->i_verity_info; > + struct page *hpage = (struct page *)block->context; > > /* > * When the Merkle tree block size and page size are the same, then the > @@ -34,6 +37,12 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, > if (!vi->hash_block_verified) > return PageChecked(hpage); > > + /* > + * Filesystems which use block based caching (e.g. XFS) always use > + * bitmap. > + */ > + if (inode->i_sb->s_vop->read_merkle_tree_block) > + return test_bit(hblock_idx, vi->hash_block_verified); > /* > * When the Merkle tree block size and page size differ, we use a bitmap > * to indicate whether each hash block has been verified. > @@ -95,15 +104,15 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, > const struct merkle_tree_params *params = &vi->tree_params; > const unsigned int hsize = params->digest_size; > int level; > + int err; FWIW, Dan Carpenter complained that err can be used uninitialized here... > + int num_ra_pages; > u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE]; > const u8 *want_hash; > u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE]; > /* The hash blocks that are traversed, indexed by level */ > struct { > - /* Page containing the hash block */ > - struct page *page; > - /* Mapped address of the hash block (will be within @page) */ > - const void *addr; > + /* Buffer containing the hash block */ > + struct fsverity_blockbuf block; > /* Index of the hash block in the tree overall */ > unsigned long index; > /* Byte offset of the wanted hash relative to @addr */ > @@ -144,10 +153,11 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, > unsigned long next_hidx; > unsigned long hblock_idx; > pgoff_t hpage_idx; > + u64 hblock_pos; > unsigned int hblock_offset_in_page; > unsigned int hoffset; > struct page *hpage; > - const void *haddr; > + struct fsverity_blockbuf *block = &hblocks[level].block; > > /* > * The index of the block in the current level; also the index > @@ -165,29 +175,49 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, > hblock_offset_in_page = > (hblock_idx << params->log_blocksize) & ~PAGE_MASK; > > + /* Offset of the Merkle tree block into the tree */ > + hblock_pos = hblock_idx << params->log_blocksize; > + > /* Byte offset of the hash within the block */ > hoffset = (hidx << params->log_digestsize) & > (params->block_size - 1); > > - hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, > - hpage_idx, level == 0 ? min(max_ra_pages, > - params->tree_pages - hpage_idx) : 0); > - if (IS_ERR(hpage)) { > + num_ra_pages = level == 0 ? > + min(max_ra_pages, params->tree_pages - hpage_idx) : 0; > + > + if (inode->i_sb->s_vop->read_merkle_tree_block) { > + err = inode->i_sb->s_vop->read_merkle_tree_block( > + inode, hblock_pos, block, params->log_blocksize, > + num_ra_pages); > + } else { > + unsigned int blocks_per_page = > + vi->tree_params.blocks_per_page; > + hblock_idx = round_down(hblock_idx, blocks_per_page); > + hpage = inode->i_sb->s_vop->read_merkle_tree_page( > + inode, hpage_idx, (num_ra_pages << PAGE_SHIFT)); > + > + if (IS_ERR(hpage)) { > + err = PTR_ERR(hpage); > + } else { > + block->kaddr = kmap_local_page(hpage) + > + hblock_offset_in_page; > + block->context = hpage; ...in the case where hpage is not an error. I'll fix that in my tree. https://lore.kernel.org/oe-kbuild-all/f9e79efc-a16f-495d-8e53-d5a9eef5936e@moroto.mountain/T/#u --D > + } > + } > + > + if (err) { > fsverity_err(inode, > - "Error %ld reading Merkle tree page %lu", > - PTR_ERR(hpage), hpage_idx); > + "Error %d reading Merkle tree block %lu", > + err, hblock_idx); > goto error; > } > - haddr = kmap_local_page(hpage) + hblock_offset_in_page; > - if (is_hash_block_verified(vi, hpage, hblock_idx)) { > - memcpy(_want_hash, haddr + hoffset, hsize); > + > + if (is_hash_block_verified(inode, block, hblock_idx)) { > + memcpy(_want_hash, block->kaddr + hoffset, hsize); > want_hash = _want_hash; > - kunmap_local(haddr); > - put_page(hpage); > + fsverity_drop_block(inode, block); > goto descend; > } > - hblocks[level].page = hpage; > - hblocks[level].addr = haddr; > hblocks[level].index = hblock_idx; > hblocks[level].hoffset = hoffset; > hidx = next_hidx; > @@ -197,10 +227,11 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, > descend: > /* Descend the tree verifying hash blocks. */ > for (; level > 0; level--) { > - struct page *hpage = hblocks[level - 1].page; > - const void *haddr = hblocks[level - 1].addr; > + struct fsverity_blockbuf *block = &hblocks[level - 1].block; > + const void *haddr = block->kaddr; > unsigned long hblock_idx = hblocks[level - 1].index; > unsigned int hoffset = hblocks[level - 1].hoffset; > + struct page *hpage = (struct page *)block->context; > > if (fsverity_hash_block(params, inode, haddr, real_hash) != 0) > goto error; > @@ -217,8 +248,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, > SetPageChecked(hpage); > memcpy(_want_hash, haddr + hoffset, hsize); > want_hash = _want_hash; > - kunmap_local(haddr); > - put_page(hpage); > + fsverity_drop_block(inode, block); > } > > /* Finally, verify the data block. */ > @@ -235,10 +265,8 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, > params->hash_alg->name, hsize, want_hash, > params->hash_alg->name, hsize, real_hash); > error: > - for (; level > 0; level--) { > - kunmap_local(hblocks[level - 1].addr); > - put_page(hblocks[level - 1].page); > - } > + for (; level > 0; level--) > + fsverity_drop_block(inode, &hblocks[level - 1].block); > return false; > } > > @@ -362,3 +390,42 @@ void __init fsverity_init_workqueue(void) > if (!fsverity_read_workqueue) > panic("failed to allocate fsverity_read_queue"); > } > + > +/** > + * fsverity_invalidate_block() - invalidate Merkle tree block > + * @inode: inode to which this Merkle tree blocks belong > + * @block: block to be invalidated > + * > + * This function invalidates/clears "verified" state of Merkle tree block > + * in the fs-verity bitmap. The block needs to have ->offset set. > + */ > +void fsverity_invalidate_block(struct inode *inode, > + struct fsverity_blockbuf *block) > +{ > + struct fsverity_info *vi = inode->i_verity_info; > + const unsigned int log_blocksize = vi->tree_params.log_blocksize; > + > + if (block->offset > vi->tree_params.tree_size) { > + fsverity_err(inode, > +"Trying to invalidate beyond Merkle tree (tree %lld, offset %lld)", > + vi->tree_params.tree_size, block->offset); > + return; > + } > + > + clear_bit(block->offset >> log_blocksize, vi->hash_block_verified); > +} > +EXPORT_SYMBOL_GPL(fsverity_invalidate_block); > + > +void fsverity_drop_block(struct inode *inode, > + struct fsverity_blockbuf *block) > +{ > + if (inode->i_sb->s_vop->drop_block) > + inode->i_sb->s_vop->drop_block(block); > + else { > + struct page *page = (struct page *)block->context; > + > + kunmap_local(block->kaddr); > + put_page(page); > + } > + block->kaddr = NULL; > +} > diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h > index ac58b19f23d3..0973b521ac5a 100644 > --- a/include/linux/fsverity.h > +++ b/include/linux/fsverity.h > @@ -26,6 +26,33 @@ > /* Arbitrary limit to bound the kmalloc() size. Can be changed. */ > #define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 > > +/** > + * struct fsverity_blockbuf - Merkle Tree block buffer > + * @kaddr: virtual address of the block's data > + * @offset: block's offset into Merkle tree > + * @size: the Merkle tree block size > + * @context: filesystem private context > + * > + * Buffer containing single Merkle Tree block. These buffers are passed > + * - to filesystem, when fs-verity is building merkel tree, > + * - from filesystem, when fs-verity is reading merkle tree from a disk. > + * Filesystems sets kaddr together with size to point to a memory which contains > + * Merkle tree block. Same is done by fs-verity when Merkle tree is need to be > + * written down to disk. > + * > + * While reading the tree, fs-verity calls ->read_merkle_tree_block followed by > + * ->drop_block to let filesystem know that memory can be freed. > + * > + * The context is optional. This field can be used by filesystem to passthrough > + * state from ->read_merkle_tree_block to ->drop_block. > + */ > +struct fsverity_blockbuf { > + void *kaddr; > + u64 offset; > + unsigned int size; > + void *context; > +}; > + > /* Verity operations for filesystems */ > struct fsverity_operations { > > @@ -107,6 +134,32 @@ struct fsverity_operations { > pgoff_t index, > unsigned long num_ra_pages); > > + /** > + * Read a Merkle tree block of the given inode. > + * @inode: the inode > + * @pos: byte offset of the block within the Merkle tree > + * @block: block buffer for filesystem to point it to the block > + * @log_blocksize: log2 of the size of the expected block > + * @ra_bytes: The number of bytes that should be > + * prefetched starting at @pos if the page at @pos > + * isn't already cached. Implementations may ignore this > + * argument; it's only a performance optimization. > + * > + * This can be called at any time on an open verity file. It may be > + * called by multiple processes concurrently. > + * > + * In case that block was evicted from the memory filesystem has to use > + * fsverity_invalidate_block() to let fsverity know that block's > + * verification state is not valid anymore. > + * > + * Return: 0 on success, -errno on failure > + */ > + int (*read_merkle_tree_block)(struct inode *inode, > + u64 pos, > + struct fsverity_blockbuf *block, > + unsigned int log_blocksize, > + u64 ra_bytes); > + > /** > * Write a Merkle tree block to the given inode. > * > @@ -122,6 +175,16 @@ struct fsverity_operations { > */ > int (*write_merkle_tree_block)(struct inode *inode, const void *buf, > u64 pos, unsigned int size); > + > + /** > + * Release the reference to a Merkle tree block > + * > + * @block: the block to release > + * > + * This is called when fs-verity is done with a block obtained with > + * ->read_merkle_tree_block(). > + */ > + void (*drop_block)(struct fsverity_blockbuf *block); > }; > > #ifdef CONFIG_FS_VERITY > @@ -175,6 +238,8 @@ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg); > bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset); > void fsverity_verify_bio(struct bio *bio); > void fsverity_enqueue_verify_work(struct work_struct *work); > +void fsverity_invalidate_block(struct inode *inode, > + struct fsverity_blockbuf *block); > > #else /* !CONFIG_FS_VERITY */ > > -- > 2.42.0 > >