[PATCH bpf-next 01/11] bpf: make bpf_d_path() helper use probe-read semantics

Matt Bobrowski <mattbobrowski@xxxxxxxxxx> · Tue, 20 Feb 2024 09:27:23 +0000

There has now been several reported instances [0, 1, 2] where the
usage of the BPF helper bpf_d_path() has led to some form of memory
corruption issue.

The fundamental reason behind why we repeatedly see bpf_d_path() being
susceptible to such memory corruption issues is because it only
enforces ARG_PTR_TO_BTF_ID constraints onto it's struct path
argument. This essentially means that it only requires an in-kernel
pointer of type struct path to be provided to it. Depending on the
underlying context and where the supplied struct path was obtained
from and when, depends on whether the struct path is fully intact or
not when calling bpf_d_path(). It's certainly possible to call
bpf_d_path() and subsequently d_path() from contexts where the
supplied struct path to bpf_d_path() has already started being torn
down by __fput() and such. An example of this is perfectly illustrated
in [0].

Moving forward, we simply cannot enforce KF_TRUSTED_ARGS semantics
onto struct path of bpf_d_path(), as this approach would presumably
lead to some pretty wide scale and highly undesirable BPF program
breakage. To avoid breaking any pre-existing BPF program that is
dependent on bpf_d_path(), I propose that we take a different path and
re-implement an incredibly minimalistic and bare bone version of
d_path() which is entirely backed by kernel probe-read semantics. IOW,
a version of d_path() that is backed by
copy_from_kernel_nofault(). This ensures that any reads performed
against the supplied struct path to bpf_d_path() which may end up
faulting for whatever reason end up being gracefully handled and fixed
up.

The caveats with such an approach is that we can't fully uphold all of
d_path()'s path resolution capabilities. Resolving a path which is
comprised of a dentry that make use of dynamic names via isn't
possible as we can't enforce probe-read semantics onto indirect
function calls performed via d_op as they're implementation
dependent. For such cases, we just return -EOPNOTSUPP. This might be a
little surprising to some users, especially those that are interested
in resolving paths that involve a dentry that resides on some
non-mountable pseudo-filesystem, being pipefs/sockfs/nsfs, but it's
arguably better than enforcing KF_TRUSTED_ARGS onto bpf_d_path() and
causing an unnecessary shemozzle for users. Additionally, we don't
make use of all the locking semantics, or handle all the erroneous
cases in which d_path() naturally would. This is fine however, as
we're only looking to provide users with a rather acceptable version
of a reconstructed path, whilst they eventually migrate over to the
trusted bpf_path_d_path() BPF kfunc variant.

Note that the selftests that go with this change to bpf_d_path() have
been purposely split out into a completely separate patch. This is so
that the reviewers attention is not torn by noise and can remain
focused on reviewing the implementation details contained within this
patch.

[0] https://lore.kernel.org/bpf/CAG48ez0ppjcT=QxU-jtCUfb5xQb3mLr=5FcwddF_VKfEBPs_Dg@xxxxxxxxxxxxxx/
[1] https://lore.kernel.org/bpf/20230606181714.532998-1-jolsa@xxxxxxxxxx/
[2] https://lore.kernel.org/bpf/20220219113744.1852259-1-memxor@xxxxxxxxx/

Signed-off-by: Matt Bobrowski <mattbobrowski@xxxxxxxxxx>
---
 fs/Makefile                       |   6 +-
 fs/probe_read_d_path.c            | 150 ++++++++++++++++++++++++++++++
 include/linux/probe_read_d_path.h |  13 +++
 kernel/trace/bpf_trace.c          |  13 ++-
 4 files changed, 172 insertions(+), 10 deletions(-)
 create mode 100644 fs/probe_read_d_path.c
 create mode 100644 include/linux/probe_read_d_path.h

diff --git a/fs/Makefile b/fs/Makefile
index c09016257f05..945c9c84d35d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -4,7 +4,7 @@
 #
 # 14 Sep 2000, Christoph Hellwig <hch@xxxxxxxxxxxxx>
 # Rewritten to use lists instead of if-statements.
-# 
+#
 
 
 obj-y :=	open.o read_write.o file_table.o super.o \
@@ -12,7 +12,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		ioctl.o readdir.o select.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
-		pnode.o splice.o sync.o utimes.o d_path.o \
+		pnode.o splice.o sync.o utimes.o d_path.o probe_read_d_path.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
 		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
 		kernel_read_file.o mnt_idmapping.o remap_range.o
@@ -58,7 +58,7 @@ obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
 obj-$(CONFIG_DLM)		+= dlm/
- 
+
 # Do not add any filesystems before this line
 obj-$(CONFIG_NETFS_SUPPORT)	+= netfs/
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
diff --git a/fs/probe_read_d_path.c b/fs/probe_read_d_path.c
new file mode 100644
index 000000000000..8d0db902f836
--- /dev/null
+++ b/fs/probe_read_d_path.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024 Google LLC.
+ */
+
+#include "asm/ptrace.h"
+#include <linux/container_of.h>
+#include <linux/dcache.h>
+#include <linux/fs_struct.h>
+#include <linux/uaccess.h>
+#include <linux/path.h>
+#include <linux/probe_read_d_path.h>
+
+#include "mount.h"
+
+#define PROBE_READ(src)                                              \
+	({                                                           \
+		typeof(src) __r;                                     \
+		if (copy_from_kernel_nofault((void *)(&__r), (&src), \
+					     sizeof((__r))))         \
+			memset((void *)(&__r), 0, sizeof((__r)));    \
+		__r;                                                 \
+	})
+
+static inline bool probe_read_d_unlinked(const struct dentry *dentry)
+{
+	return !PROBE_READ(dentry->d_hash.pprev) &&
+	       !(dentry == PROBE_READ(dentry->d_parent));
+}
+
+static long probe_read_prepend(const char *s, int len, char *buf, int *buflen)
+{
+	/*
+	 * The supplied len that is to be copied into the buffer will result in
+	 * an overflow. The true implementation of d_path() already returns an
+	 * error for such overflow cases, so the semantics with regards to the
+	 * bpf_d_path() helper returning the same error value for overflow cases
+	 * remain the same.
+	 */
+	if (len > *buflen)
+		return -ENAMETOOLONG;
+
+	/*
+	 * The supplied string fits completely into the remaining buffer
+	 * space. Attempt to make the copy.
+	 */
+	*buflen -= len;
+	buf += *buflen;
+	return copy_from_kernel_nofault(buf, s, len);
+}
+
+static bool use_dname(const struct path *path)
+{
+	const struct dentry_operations *d_op;
+	char *(*d_dname)(struct dentry *, char *, int);
+
+	d_op = PROBE_READ(path->dentry->d_op);
+	d_dname = PROBE_READ(d_op->d_dname);
+
+	return d_op && d_dname &&
+	       (!(path->dentry == PROBE_READ(path->dentry->d_parent)) ||
+		path->dentry != PROBE_READ(path->mnt->mnt_root));
+}
+
+char *probe_read_d_path(const struct path *path, char *buf, int buflen)
+{
+	int len;
+	long err;
+	struct path root;
+	struct mount *mnt;
+	struct dentry *dentry;
+
+	dentry = path->dentry;
+	mnt = container_of(path->mnt, struct mount, mnt);
+
+	/*
+	 * We cannot back dentry->d_op->d_dname() with probe-read semantics, so
+	 * just return an error to the caller when the supplied path contains a
+	 * dentry component that makes use of a dynamic name.
+	 */
+	if (use_dname(path))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	err = probe_read_prepend("\0", 1, buf, &buflen);
+	if (err)
+		return ERR_PTR(err);
+
+	if (probe_read_d_unlinked(dentry)) {
+		err = probe_read_prepend(" (deleted)", 10, buf, &buflen);
+		if (err)
+			return ERR_PTR(err);
+	}
+
+	len = buflen;
+	root = PROBE_READ(current->fs->root);
+	while (dentry != root.dentry || &mnt->mnt != root.mnt) {
+		struct dentry *parent;
+		if (dentry == PROBE_READ(mnt->mnt.mnt_root)) {
+			struct mount *m;
+
+			m = PROBE_READ(mnt->mnt_parent);
+			if (mnt != m) {
+				dentry = PROBE_READ(mnt->mnt_mountpoint);
+				mnt = m;
+				continue;
+			}
+
+			/*
+			 * If we've reached the global root, then there's
+			 * nothing we can really do but bail.
+			 */
+			break;
+		}
+
+		parent = PROBE_READ(dentry->d_parent);
+		if (dentry == parent) {
+			/*
+			 * Escaped? We return an ECANCELED error here to signify
+			 * that we've prematurely terminated pathname
+			 * reconstruction. We've potentially hit a root dentry
+			 * that isn't associated with any roots from the mounted
+			 * filesystems that we've jumped through, so it's not
+			 * clear where we are in the VFS exactly.
+			 */
+			err = -ECANCELED;
+			break;
+		}
+
+		err = probe_read_prepend(dentry->d_name.name,
+					 PROBE_READ(dentry->d_name.len), buf,
+					 &buflen);
+		if (err)
+			break;
+
+		err = probe_read_prepend("/", 1, buf, &buflen);
+		if (err)
+			break;
+		dentry = parent;
+	}
+
+	if (err)
+		return ERR_PTR(err);
+
+	if (len == buflen) {
+		err = probe_read_prepend("/", 1, buf, &buflen);
+		if (err)
+			return ERR_PTR(err);
+	}
+	return buf + buflen;
+}
diff --git a/include/linux/probe_read_d_path.h b/include/linux/probe_read_d_path.h
new file mode 100644
index 000000000000..9b3908746657
--- /dev/null
+++ b/include/linux/probe_read_d_path.h
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024 Google LLC.
+ */
+
+#ifndef _LINUX_PROBE_READ_D_PATH_H
+#define _LINUX_PROBE_READ_D_PATH_H
+
+#include <linux/path.h>
+
+extern char *probe_read_d_path(const struct path *path, char *buf, int buflen);
+
+#endif /* _LINUX_PROBE_READ_D_PATH_H */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 241ddf5e3895..12dbd9cef1fa 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -25,6 +25,7 @@
 #include <linux/verification.h>
 #include <linux/namei.h>
 #include <linux/fileattr.h>
+#include <linux/probe_read_d_path.h>
 
 #include <net/bpf_sk_storage.h>
 
@@ -923,14 +924,12 @@ BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz)
 	if (len < 0)
 		return len;
 
-	p = d_path(&copy, buf, sz);
-	if (IS_ERR(p)) {
-		len = PTR_ERR(p);
-	} else {
-		len = buf + sz - p;
-		memmove(buf, p, len);
-	}
+	p = probe_read_d_path(&copy, buf, sz);
+	if (IS_ERR(p))
+		return PTR_ERR(p);
 
+	len = buf + sz - p;
+	memmove(buf, p, len);
 	return len;
 }
 
-- 
2.44.0.rc0.258.g7320e95886-goog

/M