On Wed, Nov 22, 2023 at 01:44:39PM +0100, Christian Brauner wrote: > There's no reason we need to couple mnt idmapping to namespaces in the > way we currently do. Copy the idmapping when an idmapped mount is > created and don't take any reference on the namespace at all. > > We also can't easily refcount struct uid_gid_map because it needs to > stay the size of a cacheline otherwise we risk performance regressions > (Ignoring for a second that right now struct uid_gid_map isn't actually > 64 byte but 72 but that's a fix for another patch series.). > > Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx> > --- > fs/mnt_idmapping.c | 106 +++++++++++++++++++++++++++++++++++++++++------- > include/linux/uidgid.h | 13 ++++++ > kernel/user_namespace.c | 4 +- > 3 files changed, 106 insertions(+), 17 deletions(-) > > diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c > index 35d78cb3c38a..64c5205e2b5e 100644 > --- a/fs/mnt_idmapping.c > +++ b/fs/mnt_idmapping.c > @@ -9,8 +9,16 @@ > > #include "internal.h" > > +/* > + * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t, > + * never from raw values. These are just internal helpers. > + */ > +#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val } > +#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val } > + > struct mnt_idmap { > - struct user_namespace *owner; > + struct uid_gid_map uid_map; > + struct uid_gid_map gid_map; > refcount_t count; > }; > > @@ -20,7 +28,6 @@ struct mnt_idmap { > * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. > */ > struct mnt_idmap nop_mnt_idmap = { > - .owner = &init_user_ns, > .count = REFCOUNT_INIT(1), > }; > EXPORT_SYMBOL_GPL(nop_mnt_idmap); > @@ -65,7 +72,6 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, > kuid_t kuid) > { > uid_t uid; > - struct user_namespace *mnt_userns = idmap->owner; > > if (idmap == &nop_mnt_idmap) > return VFSUIDT_INIT(kuid); > @@ -75,7 +81,7 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, > uid = from_kuid(fs_userns, kuid); > if (uid == (uid_t)-1) > return INVALID_VFSUID; > - return VFSUIDT_INIT(make_kuid(mnt_userns, uid)); > + return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid)); > } > EXPORT_SYMBOL_GPL(make_vfsuid); > > @@ -103,7 +109,6 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, > struct user_namespace *fs_userns, kgid_t kgid) > { > gid_t gid; > - struct user_namespace *mnt_userns = idmap->owner; > > if (idmap == &nop_mnt_idmap) > return VFSGIDT_INIT(kgid); > @@ -113,7 +118,7 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, > gid = from_kgid(fs_userns, kgid); > if (gid == (gid_t)-1) > return INVALID_VFSGID; > - return VFSGIDT_INIT(make_kgid(mnt_userns, gid)); > + return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid)); > } > EXPORT_SYMBOL_GPL(make_vfsgid); > > @@ -132,11 +137,10 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap, > struct user_namespace *fs_userns, vfsuid_t vfsuid) > { > uid_t uid; > - struct user_namespace *mnt_userns = idmap->owner; > > if (idmap == &nop_mnt_idmap) > return AS_KUIDT(vfsuid); > - uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid)); > + uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid)); > if (uid == (uid_t)-1) > return INVALID_UID; > if (initial_idmapping(fs_userns)) > @@ -160,11 +164,10 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap, > struct user_namespace *fs_userns, vfsgid_t vfsgid) > { > gid_t gid; > - struct user_namespace *mnt_userns = idmap->owner; > > if (idmap == &nop_mnt_idmap) > return AS_KGIDT(vfsgid); > - gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid)); > + gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid)); > if (gid == (gid_t)-1) > return INVALID_GID; > if (initial_idmapping(fs_userns)) > @@ -195,16 +198,91 @@ int vfsgid_in_group_p(vfsgid_t vfsgid) > #endif > EXPORT_SYMBOL_GPL(vfsgid_in_group_p); > > +static int copy_mnt_idmap(struct uid_gid_map *map_from, > + struct uid_gid_map *map_to) > +{ > + struct uid_gid_extent *forward, *reverse; > + u32 nr_extents = READ_ONCE(map_from->nr_extents); > + /* Pairs with smp_wmb() when writing the idmapping. */ > + smp_rmb(); > + > + /* > + * Don't blindly copy @map_to into @map_from if nr_extents is > + * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we > + * read @nr_extents someone could have written an idmapping and > + * then we might end up with inconsistent data. So just don't do > + * anything at all. > + */ > + if (nr_extents == 0) > + return 0; > + > + /* > + * Here we know that nr_extents is greater than zero which means > + * a map has been written. Since idmappings can't be changed > + * once they have been written we know that we can safely copy > + * from @map_to into @map_from. > + */ > + > + if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { > + *map_to = *map_from; > + return 0; > + } > + > + forward = kmemdup(map_from->forward, > + nr_extents * sizeof(struct uid_gid_extent), > + GFP_KERNEL_ACCOUNT); > + if (!forward) > + return -ENOMEM; > + > + reverse = kmemdup(map_from->reverse, > + nr_extents * sizeof(struct uid_gid_extent), > + GFP_KERNEL_ACCOUNT); > + if (!reverse) { > + kfree(forward); > + return -ENOMEM; > + } > + > + /* > + * The idmapping isn't exposed anywhere so we don't need to care > + * about ordering between extent pointers and @nr_extents > + * initialization. > + */ > + map_to->forward = forward; > + map_to->reverse = reverse; > + map_to->nr_extents = nr_extents; > + return 0; > +} > + > +static void free_mnt_idmap(struct mnt_idmap *idmap) > +{ > + if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { > + kfree(idmap->uid_map.forward); > + kfree(idmap->uid_map.reverse); > + } > + if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { > + kfree(idmap->gid_map.forward); > + kfree(idmap->gid_map.reverse); > + } > + kfree(idmap); > +} > + > struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) > { > struct mnt_idmap *idmap; > + int ret; > > idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); > if (!idmap) > return ERR_PTR(-ENOMEM); > > - idmap->owner = get_user_ns(mnt_userns); > refcount_set(&idmap->count, 1); > + ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map); > + if (!ret) > + ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map); > + if (ret) { > + free_mnt_idmap(idmap); > + idmap = ERR_PTR(ret); > + } > return idmap; > } > > @@ -234,9 +312,7 @@ EXPORT_SYMBOL_GPL(mnt_idmap_get); > */ > void mnt_idmap_put(struct mnt_idmap *idmap) > { > - if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { > - put_user_ns(idmap->owner); > - kfree(idmap); > - } > + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) > + free_mnt_idmap(idmap); > } > EXPORT_SYMBOL_GPL(mnt_idmap_put); > diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h > index b0542cd11aeb..7806e93b907d 100644 > --- a/include/linux/uidgid.h > +++ b/include/linux/uidgid.h > @@ -17,6 +17,7 @@ > > struct user_namespace; > extern struct user_namespace init_user_ns; > +struct uid_gid_map; > > typedef struct { > uid_t val; > @@ -138,6 +139,9 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) > return from_kgid(ns, gid) != (gid_t) -1; > } > > +u32 map_id_down(struct uid_gid_map *map, u32 id); > +u32 map_id_up(struct uid_gid_map *map, u32 id); > + > #else > > static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) > @@ -186,6 +190,15 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) > return gid_valid(gid); > } > > +static inline u32 map_id_down(struct uid_gid_map *map, u32 id) > +{ > + return id; > +} > + > +static inline u32 map_id_up(struct uid_gid_map *map, u32 id); You accidentally put a ; here, and then fix it up in the next patch, it needs to be fixed here. Thanks, Josef