On 3/10/22 8:27 AM, Jeff Layton wrote:
On Thu, 2022-03-10 at 08:20 +0800, Xiubo Li wrote:On 3/9/22 10:50 PM, Jeff Layton wrote:On Wed, 2022-03-09 at 21:59 +0800, xiubli@xxxxxxxxxx wrote:From: Xiubo Li <xiubli@xxxxxxxxxx> For the readdir request the dentries will be pasred and dencrypted in ceph_readdir_prepopulate(). And in ceph_readdir() we could just get the dentry name from the dentry cache instead of parsing and dencrypting them again. This could improve performance. Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx> --- V7: - Fix the xfstest generic/006 crash bug about the rde->dentry == NULL. V6: - Remove CEPH_ENCRYPTED_LONG_SNAP_NAME_MAX macro and use the NAME_MAX instead, since we are limiting the max length of snapshot name to 240, which is NAME_MAX - 2 * sizeof('_') - sizeof(<inode#>). V5: - fix typo of CEPH_ENCRYPTED_LONG_SNAP_NAME_MAX macro - release the rde->dentry in destroy_reply_info fs/ceph/dir.c | 56 ++++++++++++++++++++------------------------ fs/ceph/inode.c | 7 ++++++ fs/ceph/mds_client.c | 1 + fs/ceph/mds_client.h | 1 + 4 files changed, 34 insertions(+), 31 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 6df2a91af236..2397c34e9173 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -316,8 +316,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) int err; unsigned frag = -1; struct ceph_mds_reply_info_parsed *rinfo; - struct fscrypt_str tname = FSTR_INIT(NULL, 0); - struct fscrypt_str oname = FSTR_INIT(NULL, 0); + char *dentry_name = NULL;dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);if (dfi->file_info.flags & CEPH_F_ATEND) @@ -369,14 +368,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) spin_unlock(&ci->i_ceph_lock); }- err = ceph_fname_alloc_buffer(inode, &tname);- if (err < 0) - goto out; - - err = ceph_fname_alloc_buffer(inode, &oname); - if (err < 0) - goto out; - /* proceed with a normal readdir */ more: /* do we have the correct frag content buffered? */ @@ -528,31 +519,36 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) } } } + + dentry_name = kmalloc(NAME_MAX, GFP_KERNEL); + if (!dentry_name) { + err = -ENOMEM; + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; + goto out; + } + for (; i < rinfo->dir_nr; i++) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; - struct ceph_fname fname = { .dir = inode, - .name = rde->name, - .name_len = rde->name_len, - .ctext = rde->altname, - .ctext_len = rde->altname_len }; - u32 olen = oname.len; - - err = ceph_fname_to_usr(&fname, &tname, &oname, NULL); - if (err) { - pr_err("%s unable to decode %.*s, got %d\n", __func__, - rde->name_len, rde->name, err); - goto out; - } + struct dentry *dn = rde->dentry; + int name_len;BUG_ON(rde->offset < ctx->pos);BUG_ON(!rde->inode.in); + BUG_ON(!rde->dentry);ctx->pos = rde->offset;- dout("readdir (%d/%d) -> %llx '%.*s' %p\n", - i, rinfo->dir_nr, ctx->pos, - rde->name_len, rde->name, &rde->inode.in);- if (!dir_emit(ctx, oname.name, oname.len,+ spin_lock(&dn->d_lock); + memcpy(dentry_name, dn->d_name.name, dn->d_name.len); + name_len = dn->d_name.len; + spin_unlock(&dn->d_lock); + + dentry_name[name_len] = '\0'; + dout("readdir (%d/%d) -> %llx '%s' %p\n", + i, rinfo->dir_nr, ctx->pos, dentry_name, &rde->inode.in); + + if (!dir_emit(ctx, dentry_name, name_len, ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), le32_to_cpu(rde->inode.in->mode) >> 12)) { /* @@ -566,8 +562,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) goto out; }- /* Reset the lengths to their original allocated vals */- oname.len = olen; ctx->pos++; }@@ -625,8 +619,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)err = 0; dout("readdir %p file %p done.\n", inode, file); out: - ceph_fname_free_buffer(inode, &tname); - ceph_fname_free_buffer(inode, &oname); + if (dentry_name) + kfree(dentry_name); return err; }diff --git a/fs/ceph/inode.c b/fs/ceph/inode.cindex b573a0f33450..19e5275eae1c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1909,6 +1909,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, goto out; }+ rde->dentry = NULL;dname.name = oname.name; dname.len = oname.len; dname.hash = full_name_hash(parent, dname.name, dname.len); @@ -1969,6 +1970,12 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, goto retry_lookup; }+ /*+ * ceph_readdir will use the dentry to get the name + * to avoid doing the dencrypt again there. + */ + rde->dentry = dget(dn); + /* inode */ if (d_really_is_positive(dn)) { in = d_inode(dn); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 8d704ddd7291..9e0a51ef1dfa 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -733,6 +733,7 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)kfree(rde->inode.fscrypt_auth);kfree(rde->inode.fscrypt_file); + dput(rde->dentry); } free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 0dfe24f94567..663d7754d57d 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -96,6 +96,7 @@ struct ceph_mds_reply_info_in { };struct ceph_mds_reply_dir_entry {+ struct dentry *dentry; char *name; u8 *altname; u32 name_len;Still buggy. This time generic/013 triggered this: [ 1970.839019] run fstests generic/013 at 2022-03-09 09:48:42 [ 2001.133838] ================================================================== [ 2001.138595] BUG: KASAN: slab-out-of-bounds in ceph_readdir+0x3f4/0x1dd0 [ceph] [ 2001.141997] Write of size 1 at addr ffff888120070aff by task fsstress/8682 [ 2001.144897] [ 2001.145670] CPU: 7 PID: 8682 Comm: fsstress Tainted: G E 5.17.0-rc6+ #172 [ 2001.149132] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1.fc35 04/01/2014 [ 2001.152477] Call Trace: [ 2001.153609] <TASK> [ 2001.154482] dump_stack_lvl+0x59/0x73 [ 2001.155697] print_address_description.constprop.0+0x1f/0x150 [ 2001.158021] ? ceph_readdir+0x3f4/0x1dd0 [ceph] [ 2001.159654] kasan_report.cold+0x7f/0x11b [ 2001.160951] ? ceph_readdir+0x3f4/0x1dd0 [ceph] [ 2001.168084] ceph_readdir+0x3f4/0x1dd0 [ceph] [ 2001.173258] ? ceph_d_revalidate+0x7e0/0x7e0 [ceph] [ 2001.178293] ? down_write_killable+0xc7/0x130 [ 2001.182782] ? __down_interruptible+0x1d0/0x1d0 [ 2001.187246] iterate_dir+0x107/0x2e0 [ 2001.192677] __x64_sys_getdents64+0xe2/0x1b0 [ 2001.197570] ? filldir+0x270/0x270 [ 2001.202806] ? __ia32_sys_getdents+0x1a0/0x1a0 [ 2001.207415] ? lockdep_hardirqs_on_prepare+0x129/0x220 [ 2001.211782] ? syscall_enter_from_user_mode+0x21/0x70 [ 2001.216466] do_syscall_64+0x3b/0x90 [ 2001.220674] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 2001.225674] RIP: 0033:0x7f159a774fd7 [ 2001.230170] Code: 19 fb ff 4c 89 e0 5b 5d 41 5c c3 0f 1f 84 00 00 00 00 00 f3 0f 1e fa b8 ff ff ff 7f 48 39 c2 48 0f 47 d0 b8 d9 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 01 c3 48 8b 15 21 0e 12 00 f7 d8 64 89 02 48 [ 2001.240821] RSP: 002b:00007ffff90c35d8 EFLAGS: 00000293 ORIG_RAX: 00000000000000d9 [ 2001.247670] RAX: ffffffffffffffda RBX: 0000000001186130 RCX: 00007f159a774fd7 [ 2001.255057] RDX: 0000000000010000 RSI: 0000000001186130 RDI: 0000000000000003 [ 2001.262428] RBP: 0000000001186104 R08: 0000000000000003 R09: 0000000000000000 [ 2001.269057] R10: 0000000000000000 R11: 0000000000000293 R12: ffffffffffffff80 [ 2001.275079] R13: 0000000000000019 R14: 0000000001186100 R15: 00007f159a6996c0 [ 2001.282429] </TASK> [ 2001.287900] [ 2001.291963] Allocated by task 8682: [ 2001.296303] kasan_save_stack+0x1e/0x40 [ 2001.300491] __kasan_kmalloc+0xa9/0xd0 [ 2001.304792] ceph_readdir+0x2ab/0x1dd0 [ceph] [ 2001.309093] iterate_dir+0x107/0x2e0 [ 2001.313291] __x64_sys_getdents64+0xe2/0x1b0 [ 2001.317705] do_syscall_64+0x3b/0x90 [ 2001.322210] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 2001.327112] [ 2001.331092] The buggy address belongs to the object at ffff888120070a00 [ 2001.331092] which belongs to the cache kmalloc-256 of size 256 [ 2001.341495] The buggy address is located 255 bytes inside of [ 2001.341495] 256-byte region [ffff888120070a00, ffff888120070b00) [ 2001.352252] The buggy address belongs to the page: [ 2001.357836] page:0000000089466360 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x120070 [ 2001.364225] head:0000000089466360 order:2 compound_mapcount:0 compound_pincount:0 [ 2001.370091] flags: 0x17ffffc0010200(slab|head|node=0|zone=2|lastcpupid=0x1fffff) [ 2001.375349] raw: 0017ffffc0010200 ffffea00048d7e00 dead000000000002 ffff888100042b40 [ 2001.380833] raw: 0000000000000000 0000000080200020 00000001ffffffff 0000000000000000 [ 2001.386079] page dumped because: kasan: bad access detected [ 2001.390919] [ 2001.395163] Memory state around the buggy address: [ 2001.399901] ffff888120070980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 2001.405479] ffff888120070a00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 2001.410888] >ffff888120070a80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 07 [ 2001.416090] ^ [ 2001.422577] ffff888120070b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 2001.430562] ffff888120070b80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 2001.438474] ================================================================== [ 2001.446565] Disabling lock debugging due to kernel taintHi Jeff, Yesterday I also test this and others, and I couldn't reproduce it. Just now I tried several time still worked well. [root@lxbceph1 xfstests]# ./check generic/013 FSTYP -- ceph PLATFORM -- Linux/x86_64 lxbceph1 5.17.0-rc6+ MKFS_OPTIONS -- 10.72.47.117:40437:/testB MOUNT_OPTIONS -- -o test_dummy_encryption,name=admin,nowsync,copyfrom,rasize=4096 -o context=system_u:object_r:root_t:s0 10.72.47.117:40437:/testB /mnt/kcephfs.B generic/013 102s ... 99s Ran: generic/013 Passed all 1 tests I there anything different with yours ? Such as the options, etc.Do you have KASAN turned on? I generally run with that enabled for most of my testing (for this reason).
Yeah, me too. The config is: 8443 CONFIG_KASAN=y 8444 CONFIG_KASAN_GENERIC=y 8445 CONFIG_KASAN_OUTLINE=y 8446 # CONFIG_KASAN_INLINE is not set 8447 CONFIG_KASAN_STACK=y 8448 # CONFIG_KASAN_VMALLOC is not set 8449 # CONFIG_KASAN_MODULE_TEST is not set 8450 CONFIG_HAVE_ARCH_KFENCE=y 8451 # CONFIG_KFENCE is not set 8452 # end of Memory Debugging 8453 8454 CONFIG_DEBUG_SHIRQ=y
Also, I was testing with '-o test_dummy_encryption' enabled as well. That may be a factor here.
Please see my above test I also added this.BTW, could you attach your objdump for fs/ceph/dir.o ? Mine seem different with the dump stack you past.
[ 2001.168084] ceph_readdir+0x3f4/0x1dd0 [ceph]The totoal length is 0x1dd0 ? But mine is larger than this. Not sure whether there has other config is different with yours.
I base the latest wip-fscrypt branch. - XIubo