Re: [PATCH v9 3/7] ovl: constant st_ino for non-samefs with xino

Amir Goldstein <amir73il@xxxxxxxxx> · Thu, 29 Mar 2018 19:42:44 +0300

On Thu, Mar 29, 2018 at 6:58 PM, Miklos Szeredi <miklos@xxxxxxxxxx> wrote:
> On Thu, Mar 29, 2018 at 4:18 PM, Amir Goldstein <amir73il@xxxxxxxxx> wrote:
>> On 64bit systems, when overlay layers are not all on the same fs, but
>> all inode numbers of underlying fs are not using the high bits, use the
>> high bits to partition the overlay st_ino address space.  The high bits
>> hold the fsid (upper fsid is 0).  This way overlay inode numbers are unique
>> and all inodes use overlay st_dev.  Inode numbers are also persistent
>> for a given layer configuration.
>>
>> Currently, our only indication for available high ino bits is from a
>> filesystem that supports file handles and uses the default encode_fh()
>> operation, which encodes a 32bit inode number.
>>
>> Signed-off-by: Amir Goldstein <amir73il@xxxxxxxxx>
>> ---
>>  fs/overlayfs/inode.c     | 31 +++++++++++++++++++++++++++++--
>>  fs/overlayfs/overlayfs.h |  3 ++-
>>  fs/overlayfs/ovl_entry.h |  2 ++
>>  fs/overlayfs/super.c     | 26 ++++++++++++++++++++++----
>>  fs/overlayfs/util.c      | 24 +++++++++++++++++++++---
>>  5 files changed, 76 insertions(+), 10 deletions(-)
>>
>> diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
>> index 89dfab20fe0e..7fc9c83bf2ff 100644
>> --- a/fs/overlayfs/inode.c
>> +++ b/fs/overlayfs/inode.c
>> @@ -63,6 +63,7 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat)
>>  {
>>         struct ovl_layer *lower_layer = ovl_layer_lower(dentry);
>>         bool samefs = ovl_same_sb(dentry->d_sb);
>> +       int xinobits = ovl_xino_bits(dentry->d_sb);
>>
>>         if (samefs) {
>>                 /*
>> @@ -71,7 +72,31 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat)
>>                  * which is friendly to du -x.
>>                  */
>>                 stat->dev = dentry->d_sb->s_dev;
>> -       } else if (S_ISDIR(dentry->d_inode->i_mode)) {
>> +               return 0;
>> +       } else if (xinobits) {
>> +               /*
>> +                * All inode numbers of underlying fs should not be using the
>> +                * high xinobits, so we use high xinobits to partition the
>> +                * overlay st_ino address space. The high bits hold the fsid
>> +                * (upper fsid is 0). This way overlay inode numbers are unique
>> +                * and all inodes use overlay st_dev. Inode numbers are also
>> +                * persistent for a given layer configuration.
>> +                */
>> +               if (stat->ino >> (64 - xinobits)) {
>> +                       pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
>> +                                           dentry, stat->ino, xinobits);
>> +               } else {
>> +                       if (lower_layer) {
>> +                               stat->ino |= ((u64)lower_layer->fsid)
>> +                                            << (64 - xinobits);
>> +                       }
>> +                       stat->dev = dentry->d_sb->s_dev;
>> +                       return 0;
>> +               }
>> +       }
>> +
>> +       /* The inode could not be mapped to a unified st_ino address space */
>> +       if (S_ISDIR(dentry->d_inode->i_mode)) {
>>                 /*
>>                  * Always use the overlay st_dev for directories, so 'find
>>                  * -xdev' will scan the entire overlay mount and won't cross the
>> @@ -117,11 +142,13 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
>>         /*
>>          * For non-dir or same fs, we use st_ino of the copy up origin.
>>          * This guaranties constant st_dev/st_ino across copy up.
>> +        * With xino feature and non-samefs, we use st_ino of the copy up
>> +        * origin masked with high bits that represent the layer id.
>>          *
>>          * If lower filesystem supports NFS file handles, this also guaranties
>>          * persistent st_ino across mount cycle.
>>          */
>> -       if (!is_dir || samefs) {
>> +       if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) {
>>                 if (OVL_TYPE_ORIGIN(type)) {
>>                         struct kstat lowerstat;
>>                         u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0);
>> diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
>> index 4432599ecbc6..09f5b8ce70aa 100644
>> --- a/fs/overlayfs/overlayfs.h
>> +++ b/fs/overlayfs/overlayfs.h
>> @@ -202,7 +202,8 @@ void ovl_drop_write(struct dentry *dentry);
>>  struct dentry *ovl_workdir(struct dentry *dentry);
>>  const struct cred *ovl_override_creds(struct super_block *sb);
>>  struct super_block *ovl_same_sb(struct super_block *sb);
>> -bool ovl_can_decode_fh(struct super_block *sb);
>> +int ovl_xino_bits(struct super_block *sb);
>> +int ovl_can_decode_fh(struct super_block *sb);
>>  struct dentry *ovl_indexdir(struct super_block *sb);
>>  bool ovl_index_all(struct super_block *sb);
>>  bool ovl_verify_lower(struct super_block *sb);
>> diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
>> index e1c838c27a74..6a077fb2a75f 100644
>> --- a/fs/overlayfs/ovl_entry.h
>> +++ b/fs/overlayfs/ovl_entry.h
>> @@ -63,6 +63,8 @@ struct ovl_fs {
>>         /* Did we take the inuse lock? */
>>         bool upperdir_locked;
>>         bool workdir_locked;
>> +       /* Inode numbers in all layers do not use the high xino_bits */
>> +       int xino_bits;
>>  };
>>
>>  /* private information held for every overlayfs dentry */
>> diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
>> index 7d97d30cad39..d7284444f404 100644
>> --- a/fs/overlayfs/super.c
>> +++ b/fs/overlayfs/super.c
>> @@ -17,6 +17,7 @@
>>  #include <linux/statfs.h>
>>  #include <linux/seq_file.h>
>>  #include <linux/posix_acl_xattr.h>
>> +#include <linux/exportfs.h>
>>  #include "overlayfs.h"
>>
>>  MODULE_AUTHOR("Miklos Szeredi <miklos@xxxxxxxxxx>");
>> @@ -701,6 +702,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs,
>>  static int ovl_lower_dir(const char *name, struct path *path,
>>                          struct ovl_fs *ofs, int *stack_depth, bool *remote)
>>  {
>> +       int fh_type;
>>         int err;
>>
>>         err = ovl_mount_dir_noesc(name, path);
>> @@ -720,15 +722,19 @@ static int ovl_lower_dir(const char *name, struct path *path,
>>          * The inodes index feature and NFS export need to encode and decode
>>          * file handles, so they require that all layers support them.
>>          */
>> +       fh_type = ovl_can_decode_fh(path->dentry->d_sb);
>>         if ((ofs->config.nfs_export ||
>> -            (ofs->config.index && ofs->config.upperdir)) &&
>> -           !ovl_can_decode_fh(path->dentry->d_sb)) {
>> +            (ofs->config.index && ofs->config.upperdir)) && !fh_type) {
>>                 ofs->config.index = false;
>>                 ofs->config.nfs_export = false;
>>                 pr_warn("overlayfs: fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n",
>>                         name);
>>         }
>>
>> +       /* Check if lower fs has 32bit inode numbers */
>> +       if (fh_type != FILEID_INO32_GEN)
>> +               ofs->xino_bits = 0;
>> +
>>         return 0;
>>
>>  out_put:
>> @@ -952,6 +958,7 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath)
>>  {
>>         struct vfsmount *mnt = ofs->upper_mnt;
>>         struct dentry *temp;
>> +       int fh_type;
>>         int err;
>>
>>         err = mnt_want_write(mnt);
>> @@ -1001,12 +1008,16 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath)
>>         }
>>
>>         /* Check if upper/work fs supports file handles */
>> -       if (ofs->config.index &&
>> -           !ovl_can_decode_fh(ofs->workdir->d_sb)) {
>> +       fh_type = ovl_can_decode_fh(ofs->workdir->d_sb);
>> +       if (ofs->config.index && !fh_type) {
>>                 ofs->config.index = false;
>>                 pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n");
>>         }
>>
>> +       /* Check if upper fs has 32bit inode numbers */
>> +       if (fh_type != FILEID_INO32_GEN)
>> +               ofs->xino_bits = 0;
>> +
>>         /* NFS export of r/w mount depends on index */
>>         if (ofs->config.nfs_export && !ofs->config.index) {
>>                 pr_warn("overlayfs: NFS export requires \"index=on\", falling back to nfs_export=off.\n");
>> @@ -1185,6 +1196,11 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack,
>>                 }
>>                 ofs->numlower++;
>>         }
>> +
>> +       /* When all layers on same fs, overlay can use real inode numbers */
>> +       if (!ofs->numlowerfs || (ofs->numlowerfs == 1 && !ofs->upper_mnt))
>> +               ofs->xino_bits = 0;
>> +
>>         err = 0;
>>  out:
>>         return err;
>> @@ -1308,6 +1324,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
>>
>>         sb->s_stack_depth = 0;
>>         sb->s_maxbytes = MAX_LFS_FILESIZE;
>> +       /* Assume underlying fs uses 32bit inodes unless proven otherwise */
>> +       ofs->xino_bits = BITS_PER_LONG - 32;
>
> This disables xino for 32bit archs.  Which is probably the right thing
> to do, otherwise there might be a regression in some cases since the
> kernel will return EOVERFLOW if st_ino would overflow. Well, this is
> also true for 32bit mode on a 64bit kernel, so the above is not a
> perfect solution.
>
> Not sure if we need to worry.  For 32bit archs, I think disabling xino
> is OK; it can be enabled explicitly if needed.  For 64bit archs, let's
> hope it doesn't regress for anybody and if it does, we need to take
> steps.

Perhaps the steps would be to implement -o noxino.

>
> So if you agree, I'll just add these as a comment.
>

Ok.

Thanks,
Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-unionfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html