As atomic_inc_not_zero() is implemented with a try_cmpxchg() loop it has
O(N^2) behaviour under contention with N concurrent operations. The rcuref
infrastructure uses atomic_add_negative_relaxed() for the fast path, which
scales better under contention, and we get overflow protection for free.

I've been testing this with will-it-scale using fstat() on a machine that
Jens gave me access to (thank you very much!):

  processor  : 511
  vendor_id  : AuthenticAMD
  cpu family : 25
  model      : 160
  model name : AMD EPYC 9754 128-Core Processor

and I consistently get a 3-5% improvement on 256+ threads.

Files are SLAB_TYPESAFE_BY_RCU and thus don't have "regular" RCU
protection. In short, freeing of files isn't delayed until a grace period
has elapsed. Instead, they are freed immediately and thus can be reused
(multiple times) within the same grace period.

So when picking a file from the file descriptor table via its file
descriptor number it is possible to see an elevated reference count on
file->f_count even though the file has already been recycled, possibly
multiple times, by another task.

To guard against this the vfs will pick the file from the file descriptor
table via its file descriptor number twice: once before the
rcuref_long_get() and once after, and compare the pointers (grossly
simplified; two illustrative sketches follow the race diagrams below). If
they match the file is still valid. If not, the caller needs to fput() it.

The rcuref infrastructure requires explicit RCU protection to handle the
following race:

> Deconstruction race
> ===================
>
> The release operation must be protected by prohibiting a grace period in
> order to prevent a possible use after free:
>
>     T1                                  T2
>     put()                               get()
>     // ref->refcnt = ONEREF
>     if (!atomic_add_negative(-1, &ref->refcnt))
>             return false;                                <- Not taken
>
>     // ref->refcnt == NOREF
>     --> preemption
>                                         // Elevates ref->refcnt to ONEREF
>                                         if (!atomic_add_negative(1, &ref->refcnt))
>                                                 return true;         <- taken
>
>                                         if (put(&p->ref)) {  <-- Succeeds
>                                                 remove_pointer(p);
>                                                 kfree_rcu(p, rcu);
>                                         }
>
>     RCU grace period ends, object is freed
>
>     atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);           <- UAF
>
> [...] it prevents the grace period which keeps the object alive until
> all put() operations complete.

Having files be SLAB_TYPESAFE_BY_RCU shouldn't cause any problems for the
rcuref deconstruction race. Afaict, the only interesting case would be
someone freeing the file and someone immediately recycling it within the
same grace period and reinitializing file->f_count to ONEREF while a
concurrent fput() is doing atomic_cmpxchg(&ref->refcnt, NOREF, DEAD) as in
the race above. But this seems safe from SLAB_TYPESAFE_BY_RCU's perspective
and it should be safe from rcuref's perspective:

T1                                  T2                                  T3
fput()                              fget()
// f_count->refcnt = ONEREF
if (!atomic_add_negative(-1, &f_count->refcnt))
        return false; <- Not taken

// f_count->refcnt == NOREF
--> preemption
                                    // Elevates f_count->refcnt to ONEREF
                                    if (!atomic_add_negative(1, &f_count->refcnt))
                                            return true; <- taken

                                    if (put(&f_count)) { <-- Succeeds
                                            remove_pointer(p);
                                            /*
                                             * Cache is SLAB_TYPESAFE_BY_RCU
                                             * so this is freed without a
                                             * grace period.
                                             */
                                            kmem_cache_free(p);
                                    }

                                                                        kmem_cache_alloc()
                                                                        init_file()
                                                                        {
                                                                                // Sets f_count->refcnt to ONEREF
                                                                                rcuref_long_init(&f->f_count, 1);
                                                                        }

Object has been reused within the same grace period via
kmem_cache_alloc()'s SLAB_TYPESAFE_BY_RCU.

/*
 * With SLAB_TYPESAFE_BY_RCU this would be a safe UAF access and it would
 * probably work correctly because the atomic_cmpxchg() will fail since the
 * refcount has been reset to ONEREF by T3.
 */
atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF
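For illustration only, here are the two grossly simplified sketches
referenced above. The names rcuref_long_get_slowpath() and
fget_rcu_sketch(), as well as the assumption that rcuref_long_t wraps an
atomic_long_t refcnt, are made up for the example; the real code lives in
include/linux/rcuref_long.h and in __fget_files_rcu() in fs/file.c.

The rcuref_long fast path, modelled on the existing rcuref_get():

        static inline __must_check bool rcuref_long_get(rcuref_long_t *ref)
        {
                /* Fast path: a single relaxed atomic add, no cmpxchg loop. */
                if (likely(!atomic_long_add_negative_relaxed(1, &ref->refcnt)))
                        return true;
                /* Saturation and dead zone handling happens in the slowpath. */
                return rcuref_long_get_slowpath(ref);
        }

The double lookup the vfs does when resolving a file descriptor:

        static struct file *fget_rcu_sketch(struct file __rcu **fdentry)
        {
                struct file *file;

                rcu_read_lock();
                for (;;) {
                        file = rcu_dereference_raw(*fdentry);
                        if (!file)
                                break;

                        /* May bump the count of an already recycled file. */
                        if (unlikely(!rcuref_long_get(&file->f_count)))
                                continue;

                        /*
                         * Reload the slot. If the pointer changed the file
                         * was recycled under us; drop the spurious reference
                         * and retry.
                         */
                        if (unlikely(file != smp_load_acquire(fdentry))) {
                                fput(file);
                                continue;
                        }
                        break;
                }
                rcu_read_unlock();
                return file;
        }

The real __fget_files_rcu() additionally re-checks that the fdtable itself
hasn't been reallocated, see the fs/file.c hunks below.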
Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
---
 drivers/gpu/drm/i915/gt/shmem_utils.c |  2 +-
 drivers/gpu/drm/vmwgfx/ttm_object.c   |  2 +-
 fs/eventpoll.c                        |  2 +-
 fs/file.c                             | 17 ++++++++---------
 fs/file_table.c                       |  7 ++++---
 include/linux/fs.h                    |  9 +++++----
 include/linux/rcuref_long.h           |  5 +++--
 7 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c
index 1fb6ff77fd899111a0797dac0edd3f2cfa01f42d..bb696b29ee2c992c6b6d0ec5ae538f9ebbb9ed29 100644
--- a/drivers/gpu/drm/i915/gt/shmem_utils.c
+++ b/drivers/gpu/drm/i915/gt/shmem_utils.c
@@ -40,7 +40,7 @@ struct file *shmem_create_from_object(struct drm_i915_gem_object *obj)
 
 	if (i915_gem_object_is_shmem(obj)) {
 		file = obj->base.filp;
-		atomic_long_inc(&file->f_count);
+		get_file(file);
 		return file;
 	}
 
diff --git a/drivers/gpu/drm/vmwgfx/ttm_object.c b/drivers/gpu/drm/vmwgfx/ttm_object.c
index 3353e97687d1d5d0e05bdc8f26ae4b0aae53a997..539dfec0e623ec2be730924fe7b8e28a2ff1face 100644
--- a/drivers/gpu/drm/vmwgfx/ttm_object.c
+++ b/drivers/gpu/drm/vmwgfx/ttm_object.c
@@ -471,7 +471,7 @@ void ttm_object_device_release(struct ttm_object_device **p_tdev)
  */
 static bool __must_check get_dma_buf_unless_doomed(struct dma_buf *dmabuf)
 {
-	return atomic_long_inc_not_zero(&dmabuf->file->f_count) != 0L;
+	return rcuref_long_get(&dmabuf->file->f_count);
 }
 
 /**
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1ae4542f0bd88b07e323d0dd75be6c0fe9fff54f..0a033950225af274c21e503a6ea4813e5bab5dc2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1002,7 +1002,7 @@ static struct file *epi_fget(const struct epitem *epi)
 	struct file *file;
 
 	file = epi->ffd.file;
-	if (!atomic_long_inc_not_zero(&file->f_count))
+	if (!rcuref_long_get(&file->f_count))
 		file = NULL;
 	return file;
 }
diff --git a/fs/file.c b/fs/file.c
index 5125607d040a2ff073e170d043124db5f444a90a..74e7a1cd709fc2147655d5e4b75cc2d8250bed88 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -866,10 +866,10 @@ static struct file *__get_file_rcu(struct file __rcu **f)
 	if (!file)
 		return NULL;
 
-	if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+	if (unlikely(!rcuref_long_get(&file->f_count)))
 		return ERR_PTR(-EAGAIN);
 
-	file_reloaded = rcu_dereference_raw(*f);
+	file_reloaded = smp_load_acquire(f);
 
 	/*
 	 * Ensure that all accesses have a dependency on the load from
@@ -880,8 +880,8 @@ static struct file *__get_file_rcu(struct file __rcu **f)
 	OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
 
 	/*
-	 * atomic_long_inc_not_zero() above provided a full memory
-	 * barrier when we acquired a reference.
+	 * smp_load_acquire() provided an acquire barrier when we loaded
+	 * the file pointer.
 	 *
 	 * This is paired with the write barrier from assigning to the
 	 * __rcu protected file pointer so that if that pointer still
@@ -979,11 +979,10 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		 * We need to confirm it by incrementing the refcount
 		 * and then check the lookup again.
 		 *
-		 * atomic_long_inc_not_zero() gives us a full memory
-		 * barrier. We only really need an 'acquire' one to
-		 * protect the loads below, but we don't have that.
+		 * rcuref_long_get() doesn't provide a memory barrier so
+		 * we use smp_load_acquire() on the file pointer below.
 		 */
-		if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+		if (unlikely(!rcuref_long_get(&file->f_count)))
 			continue;
 
 		/*
@@ -1000,7 +999,7 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		 *
 		 * If so, we need to put our ref and try again.
 		 */
-		if (unlikely(file != rcu_dereference_raw(*fdentry)) ||
+		if (unlikely(file != smp_load_acquire(fdentry)) ||
 		    unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
 			fput(file);
 			continue;
diff --git a/fs/file_table.c b/fs/file_table.c
index 9fc9048145ca023ef8af8769d5f1234a69f10df1..f4b96a9dade804a81347865625418a0fdc9a7c09 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -28,6 +28,7 @@
 #include <linux/task_work.h>
 #include <linux/swap.h>
 #include <linux/kmemleak.h>
+#include <linux/rcuref_long.h>
 
 #include <linux/atomic.h>
 
@@ -175,7 +176,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
 	 * fget-rcu pattern users need to be able to handle spurious
 	 * refcount bumps we should reinitialize the reused file first.
 	 */
-	atomic_long_set(&f->f_count, 1);
+	rcuref_long_init(&f->f_count, 1);
 	return 0;
 }
 
@@ -480,7 +481,7 @@ static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
 
 void fput(struct file *file)
 {
-	if (atomic_long_dec_and_test(&file->f_count)) {
+	if (rcuref_long_put_rcusafe(&file->f_count)) {
 		struct task_struct *task = current;
 
 		if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
@@ -513,7 +514,7 @@ void fput(struct file *file)
  */
 void __fput_sync(struct file *file)
 {
-	if (atomic_long_dec_and_test(&file->f_count))
+	if (rcuref_long_put_rcusafe(&file->f_count))
 		__fput(file);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e3c603d01337650d562405500013f5c4cfed8eb6..a7831eaf0edd13ebe9765e532602688b317da315 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -45,6 +45,7 @@
 #include <linux/slab.h>
 #include <linux/maple_tree.h>
 #include <linux/rw_hint.h>
+#include <linux/rcuref_long.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1030,7 +1031,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
  * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
  */
 struct file {
-	atomic_long_t			f_count;
+	rcuref_long_t			f_count;
 	spinlock_t			f_lock;
 	fmode_t				f_mode;
 	const struct file_operations	*f_op;
@@ -1078,15 +1079,15 @@ struct file_handle {
 
 static inline struct file *get_file(struct file *f)
 {
-	long prior = atomic_long_fetch_inc_relaxed(&f->f_count);
-	WARN_ONCE(!prior, "struct file::f_count incremented from zero; use-after-free condition present!\n");
+	WARN_ONCE(!rcuref_long_get(&f->f_count),
+		  "struct file::f_count incremented from zero; use-after-free condition present!\n");
 	return f;
 }
 
 struct file *get_file_rcu(struct file __rcu **f);
 struct file *get_file_active(struct file **f);
 
-#define file_count(x)	atomic_long_read(&(x)->f_count)
+#define file_count(x)	rcuref_long_read(&(x)->f_count)
 
 #define MAX_NON_LFS	((1UL<<31) - 1)
 
diff --git a/include/linux/rcuref_long.h b/include/linux/rcuref_long.h
index 7cedc537e5268e114f1a4221a4f1b0cb8d0e1241..10623119bb5038a1b171e31b8fd962a87e3670f5 100644
--- a/include/linux/rcuref_long.h
+++ b/include/linux/rcuref_long.h
@@ -85,11 +85,12 @@ __must_check bool rcuref_long_put_slowpath(rcuref_long_t *ref);
 
 /*
  * Internal helper. Do not invoke directly.
+ *
+ * Ideally we'd RCU_LOCKDEP_WARN() here but we can't since this API is
+ * used with SLAB_TYPESAFE_BY_RCU.
  */
 static __always_inline __must_check bool __rcuref_long_put(rcuref_long_t *ref)
 {
-	RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(),
-			 "suspicious rcuref_put_rcusafe() usage");
 	/*
 	 * Unconditionally decrease the reference count. The saturation and
 	 * dead zones provide enough tolerance for this.
-- 
2.45.2