Re: [PATCH v5] ceph: do not dencrypt the dentry name twice for readdir

Luís Henriques <lhenriques@xxxxxxx> · Tue, 08 Mar 2022 17:47:45 +0000

xiubli@xxxxxxxxxx writes:

> From: Xiubo Li <xiubli@xxxxxxxxxx>
>
> For the readdir request the dentries will be pasred and dencrypted
> in ceph_readdir_prepopulate(). And in ceph_readdir() we could just
> get the dentry name from the dentry cache instead of parsing and
> dencrypting them again. This could improve performance.
>
> Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
> ---
>
> V5:
> - fix typo of CEPH_ENCRYPTED_LONG_SNAP_NAME_MAX macro
> - release the rde->dentry in destroy_reply_info
>
>
>  fs/ceph/crypto.h     |  8 ++++++
>  fs/ceph/dir.c        | 59 +++++++++++++++++++++-----------------------
>  fs/ceph/inode.c      |  7 ++++++
>  fs/ceph/mds_client.c |  2 ++
>  fs/ceph/mds_client.h |  1 +
>  5 files changed, 46 insertions(+), 31 deletions(-)
>
> diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
> index 1e08f8a64ad6..c85cb8c8bd79 100644
> --- a/fs/ceph/crypto.h
> +++ b/fs/ceph/crypto.h
> @@ -83,6 +83,14 @@ static inline u32 ceph_fscrypt_auth_len(struct ceph_fscrypt_auth *fa)
>   */
>  #define CEPH_NOHASH_NAME_MAX (189 - SHA256_DIGEST_SIZE)
>  
> +/*
> + * The encrypted long snap name will be in format of
> + * "_${ENCRYPTED-LONG-SNAP-NAME}_${INODE-NUM}". And will set the max longth
> + * to sizeof('_') + NAME_MAX + sizeof('_') + max of sizeof(${INO}) + extra 7
> + * bytes to align the total size to 8 bytes.
> + */
> +#define CEPH_ENCRYPTED_LONG_SNAP_NAME_MAX (1 + 255 + 1 + 16 + 7)
> +

I think this constant needs to be defined in a different way and we need
to keep the snapshots names length a bit shorter than NAME_MAX.  And I'm
not talking just about the encrypted snapshots.

Right now, ceph PR#45192 fixes an MDS limitation that is keeping long
snapshot names smaller than 80 characters.  With this limitation we would
need to keep the snapshot names < 64:

   '_' + <name> + '_' + '<inode#>' '\0'
    1  +   64   +  1  +    12     +  1 = 80

Note however that currently clients *do* allow to create snapshots with
bigger names.  And if we do that we'll get an error when doing an LSSNAP
on a .snap subdirectory that will contain the corresponding long name:

  # mkdir a/.snap/123456qwertasdfgzxcvb7890yuiophjklnm123456qwertasdfgzxcvb78912345
  # ls -li a/b/.snap
  ls: a/b/.snap/_123456qwertasdfgzxcvb7890yuiophjklnm123456qwertasdfgzxcvb78912345_109951162777: No such file or directory

We can limit the snapshot names on creation, but this should probably be
handled on the MDS side (so that old clients won't break anything).  Does
this make sense?  I can work on an MDS patch for this but... to which
length should names be limited? NAME_MAX - (2*'_' + <inode len>)?  Or
should we take base64-encoded names already into account?

(Sorry, I'm jumping around between PRs and patches, and trying to make any
sense out of the snapshots code :-/ )

Cheers,
-- 
Luís

>  void ceph_fscrypt_set_ops(struct super_block *sb);
>  
>  void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc);
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 6df2a91af236..b30429e2d079 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -316,8 +316,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  	int err;
>  	unsigned frag = -1;
>  	struct ceph_mds_reply_info_parsed *rinfo;
> -	struct fscrypt_str tname = FSTR_INIT(NULL, 0);
> -	struct fscrypt_str oname = FSTR_INIT(NULL, 0);
> +	char *dentry_name = NULL;
>  
>  	dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
>  	if (dfi->file_info.flags & CEPH_F_ATEND)
> @@ -369,14 +368,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  		spin_unlock(&ci->i_ceph_lock);
>  	}
>  
> -	err = ceph_fname_alloc_buffer(inode, &tname);
> -	if (err < 0)
> -		goto out;
> -
> -	err = ceph_fname_alloc_buffer(inode, &oname);
> -	if (err < 0)
> -		goto out;
> -
>  	/* proceed with a normal readdir */
>  more:
>  	/* do we have the correct frag content buffered? */
> @@ -528,31 +519,39 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  			}
>  		}
>  	}
> +
> +	dentry_name = kmalloc(CEPH_ENCRYPTED_LONG_SNAP_NAME_MAX, GFP_KERNEL);
> +	if (!dentry_name) {
> +		err = -ENOMEM;
> +		ceph_mdsc_put_request(dfi->last_readdir);
> +		dfi->last_readdir = NULL;
> +		goto out;
> +	}
> +
>  	for (; i < rinfo->dir_nr; i++) {
>  		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
> -		struct ceph_fname fname = { .dir	= inode,
> -					    .name	= rde->name,
> -					    .name_len	= rde->name_len,
> -					    .ctext	= rde->altname,
> -					    .ctext_len	= rde->altname_len };
> -		u32 olen = oname.len;
> -
> -		err = ceph_fname_to_usr(&fname, &tname, &oname, NULL);
> -		if (err) {
> -			pr_err("%s unable to decode %.*s, got %d\n", __func__,
> -			       rde->name_len, rde->name, err);
> -			goto out;
> -		}
> +		struct dentry *dn = rde->dentry;
> +		int name_len;
>  
>  		BUG_ON(rde->offset < ctx->pos);
>  		BUG_ON(!rde->inode.in);
> +		BUG_ON(!rde->dentry);
>  
>  		ctx->pos = rde->offset;
> -		dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
> -		     i, rinfo->dir_nr, ctx->pos,
> -		     rde->name_len, rde->name, &rde->inode.in);
>  
> -		if (!dir_emit(ctx, oname.name, oname.len,
> +		spin_lock(&dn->d_lock);
> +		memcpy(dentry_name, dn->d_name.name, dn->d_name.len);
> +		name_len = dn->d_name.len;
> +		spin_unlock(&dn->d_lock);
> +
> +		dentry_name[name_len] = '\0';
> +		dout("readdir (%d/%d) -> %llx '%s' %p\n",
> +		     i, rinfo->dir_nr, ctx->pos, dentry_name, &rde->inode.in);
> +
> +		dput(dn);
> +		rde->dentry = NULL;
> +
> +		if (!dir_emit(ctx, dentry_name, name_len,
>  			      ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
>  			      le32_to_cpu(rde->inode.in->mode) >> 12)) {
>  			/*
> @@ -566,8 +565,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  			goto out;
>  		}
>  
> -		/* Reset the lengths to their original allocated vals */
> -		oname.len = olen;
>  		ctx->pos++;
>  	}
>  
> @@ -625,8 +622,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  	err = 0;
>  	dout("readdir %p file %p done.\n", inode, file);
>  out:
> -	ceph_fname_free_buffer(inode, &tname);
> -	ceph_fname_free_buffer(inode, &oname);
> +	if (dentry_name)
> +		kfree(dentry_name);
>  	return err;
>  }
>  
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index e5a9838981ba..dfb7b4461857 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -1902,6 +1902,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
>  			goto out;
>  		}
>  
> +		rde->dentry = NULL;
>  		dname.name = oname.name;
>  		dname.len = oname.len;
>  		dname.hash = full_name_hash(parent, dname.name, dname.len);
> @@ -1962,6 +1963,12 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
>  			goto retry_lookup;
>  		}
>  
> +		/*
> +		 * ceph_readdir will use the dentry to get the name
> +		 * to avoid doing the dencrypt again there.
> +		 */
> +		rde->dentry = dget(dn);
> +
>  		/* inode */
>  		if (d_really_is_positive(dn)) {
>  			in = d_inode(dn);
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index f0d2442187a3..9de749dd715c 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -733,6 +733,8 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
>  
>  		kfree(rde->inode.fscrypt_auth);
>  		kfree(rde->inode.fscrypt_file);
> +		dput(rde->dentry);
> +		rde->dentry = NULL;
>  	}
>  	free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
>  }
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 0dfe24f94567..663d7754d57d 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -96,6 +96,7 @@ struct ceph_mds_reply_info_in {
>  };
>  
>  struct ceph_mds_reply_dir_entry {
> +	struct dentry		      *dentry;
>  	char                          *name;
>  	u8			      *altname;
>  	u32                           name_len;
> -- 
>
> 2.27.0
>