[PATCH RFC v3 17/17] fuse: {uring} Pin the user buffer

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is to allow copying into the buffer from the application
without the need to copy in ring context (and with that,
the need that the ring task is active in kernel space).

Also absolutely needed for now to avoid this teardown issue

 1525.905504] KASAN: null-ptr-deref in range [0x00000000000001a0-0x00000000000001a7]
[ 1525.910431] CPU: 15 PID: 183 Comm: kworker/15:1 Tainted: G           O       6.10.0+ #48
[ 1525.916449] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[ 1525.922470] Workqueue: events io_fallback_req_func
[ 1525.925840] RIP: 0010:__lock_acquire+0x74/0x7b80
[ 1525.929010] Code: 89 bc 24 80 00 00 00 0f 85 1c 5f 00 00 83 3d 6e 80 b0 02 00 0f 84 1d 12 00 00 83 3d 65 c7 67 02 00 74 27 48 89 f8 48 c1 e8 03 <42> 80 3c 30 00 74 0d e8 50 44 42 00 48 8b bc 24 80 00 00 00 48 c7
[ 1525.942211] RSP: 0018:ffff88810b2af490 EFLAGS: 00010002
[ 1525.945672] RAX: 0000000000000034 RBX: 0000000000000000 RCX: 0000000000000001
[ 1525.950421] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000000001a0
[ 1525.955200] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
[ 1525.959979] R10: dffffc0000000000 R11: fffffbfff07b1cbe R12: 0000000000000000
[ 1525.964252] R13: 0000000000000001 R14: dffffc0000000000 R15: 0000000000000001
[ 1525.968225] FS:  0000000000000000(0000) GS:ffff88875b200000(0000) knlGS:0000000000000000
[ 1525.973932] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1525.976694] CR2: 00005555b6a381f0 CR3: 000000012f5f1000 CR4: 00000000000006f0
[ 1525.980030] Call Trace:
[ 1525.981371]  <TASK>
[ 1525.982567]  ? __die_body+0x66/0xb0
[ 1525.984376]  ? die_addr+0xc1/0x100
[ 1525.986111]  ? exc_general_protection+0x1c6/0x330
[ 1525.988401]  ? asm_exc_general_protection+0x22/0x30
[ 1525.990864]  ? __lock_acquire+0x74/0x7b80
[ 1525.992901]  ? mark_lock+0x9f/0x360
[ 1525.994635]  ? __lock_acquire+0x1420/0x7b80
[ 1525.996629]  ? attach_entity_load_avg+0x47d/0x550
[ 1525.998765]  ? hlock_conflict+0x5a/0x1f0
[ 1526.000515]  ? __bfs+0x2dc/0x5a0
[ 1526.001993]  lock_acquire+0x1fb/0x3d0
[ 1526.004727]  ? gup_fast_fallback+0x13f/0x1d80
[ 1526.006586]  ? gup_fast_fallback+0x13f/0x1d80
[ 1526.008412]  gup_fast_fallback+0x158/0x1d80
[ 1526.010170]  ? gup_fast_fallback+0x13f/0x1d80
[ 1526.011999]  ? __lock_acquire+0x2b07/0x7b80
[ 1526.013793]  __iov_iter_get_pages_alloc+0x36e/0x980
[ 1526.015876]  ? do_raw_spin_unlock+0x5a/0x8a0
[ 1526.017734]  iov_iter_get_pages2+0x56/0x70
[ 1526.019491]  fuse_copy_fill+0x48e/0x980 [fuse]
[ 1526.021400]  fuse_copy_args+0x174/0x6a0 [fuse]
[ 1526.023199]  fuse_uring_prepare_send+0x319/0x6c0 [fuse]
[ 1526.025178]  fuse_uring_send_req_in_task+0x42/0x100 [fuse]
[ 1526.027163]  io_fallback_req_func+0xb4/0x170
[ 1526.028737]  ? process_scheduled_works+0x75b/0x1160
[ 1526.030445]  process_scheduled_works+0x85c/0x1160
[ 1526.032073]  worker_thread+0x8ba/0xce0
[ 1526.033388]  kthread+0x23e/0x2b0
[ 1526.035404]  ? pr_cont_work_flush+0x290/0x290
[ 1526.036958]  ? kthread_blkcg+0xa0/0xa0
[ 1526.038321]  ret_from_fork+0x30/0x60
[ 1526.039600]  ? kthread_blkcg+0xa0/0xa0
[ 1526.040942]  ret_from_fork_asm+0x11/0x20
[ 1526.042353]  </TASK>

Signed-off-by: Bernd Schubert <bschubert@xxxxxxx>
---
 fs/fuse/dev.c         |   9 +++
 fs/fuse/dev_uring.c   | 186 ++++++++++++++++++++++++++++++++------------------
 fs/fuse/dev_uring_i.h |  15 ++--
 fs/fuse/fuse_dev_i.h  |   2 +
 4 files changed, 143 insertions(+), 69 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 9f0f2120b1fa..492bb95fde4e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -769,6 +769,15 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 			cs->pipebufs++;
 			cs->nr_segs++;
 		}
+	} else if (cs->is_uring) {
+		cs->pg = cs->ring.pages[cs->ring.page_idx++];
+		/*
+		 * non stricly needed, just to avoid a uring exception in
+		 * fuse_copy_finish
+		 */
+		get_page(cs->pg);
+		cs->len = PAGE_SIZE;
+		cs->offset = 0;
 	} else {
 		size_t off;
 		err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off);
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index a65c5d08fce1..4cc0facaaae3 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -29,6 +29,9 @@
 #include <linux/topology.h>
 #include <linux/io_uring/cmd.h>
 
+#define FUSE_RING_HEADER_PG 0
+#define FUSE_RING_PAYLOAD_PG 1
+
 struct fuse_uring_cmd_pdu {
 	struct fuse_ring_ent *ring_ent;
 };
@@ -250,6 +253,21 @@ static void fuse_uring_stop_fuse_req_end(struct fuse_ring_ent *ent)
 	fuse_request_end(req);
 }
 
+/*
+ * Copy from memmap.c, should be exported
+ */
+static void io_pages_free(struct page ***pages, int npages)
+{
+	struct page **page_array = *pages;
+
+	if (!page_array)
+		return;
+
+	unpin_user_pages(page_array, npages);
+	kvfree(page_array);
+	*pages = NULL;
+}
+
 /*
  * Release a request/entry on connection tear down
  */
@@ -275,6 +293,8 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent,
 	if (ent->fuse_req)
 		fuse_uring_stop_fuse_req_end(ent);
 
+	io_pages_free(&ent->user_pages, ent->nr_user_pages);
+
 	ent->state = FRRS_FREED;
 }
 
@@ -417,6 +437,7 @@ static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
 		goto seterr;
 	}
 
+	/* FIXME copied from dev.c, check what 512 means  */
 	if (oh->error <= -512 || oh->error > 0) {
 		err = -EINVAL;
 		goto seterr;
@@ -465,53 +486,41 @@ static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
 
 static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
 				     struct fuse_req *req,
-				     struct fuse_ring_ent *ent)
+				     struct fuse_ring_ent *ent,
+				     struct fuse_ring_req *rreq)
 {
-	struct fuse_ring_req __user *rreq = ent->rreq;
 	struct fuse_copy_state cs;
 	struct fuse_args *args = req->args;
 	struct iov_iter iter;
-	int err;
-	int res_arg_len;
+	int res_arg_len, err;
 
-	err = copy_from_user(&res_arg_len, &rreq->in_out_arg_len,
-			     sizeof(res_arg_len));
-	if (err)
-		return err;
-
-	err = import_ubuf(ITER_SOURCE, (void __user *)&rreq->in_out_arg,
-			  ent->max_arg_len, &iter);
-	if (err)
-		return err;
+	res_arg_len = rreq->in_out_arg_len;
 
 	fuse_copy_init(&cs, 0, &iter);
 	cs.is_uring = 1;
+	cs.ring.pages = &ent->user_pages[FUSE_RING_PAYLOAD_PG];
 	cs.req = req;
 
-	return fuse_copy_out_args(&cs, args, res_arg_len);
+	err = fuse_copy_out_args(&cs, args, res_arg_len);
+
+	return err;
 }
 
- /*
-  * Copy data from the req to the ring buffer
-  */
+/*
+ * Copy data from the req to the ring buffer
+ */
 static int fuse_uring_copy_to_ring(struct fuse_ring *ring, struct fuse_req *req,
-				   struct fuse_ring_ent *ent)
+				   struct fuse_ring_ent *ent,
+				   struct fuse_ring_req *rreq)
 {
-	struct fuse_ring_req __user *rreq = ent->rreq;
 	struct fuse_copy_state cs;
 	struct fuse_args *args = req->args;
-	int err, res;
+	int err;
 	struct iov_iter iter;
 
-	err = import_ubuf(ITER_DEST, (void __user *)&rreq->in_out_arg,
-			  ent->max_arg_len, &iter);
-	if (err) {
-		pr_info("Import user buffer failed\n");
-		return err;
-	}
-
 	fuse_copy_init(&cs, 1, &iter);
 	cs.is_uring = 1;
+	cs.ring.pages = &ent->user_pages[FUSE_RING_PAYLOAD_PG];
 	cs.req = req;
 	err = fuse_copy_args(&cs, args->in_numargs, args->in_pages,
 			     (struct fuse_arg *)args->in_args, 0);
@@ -520,10 +529,7 @@ static int fuse_uring_copy_to_ring(struct fuse_ring *ring, struct fuse_req *req,
 		return err;
 	}
 
-	BUILD_BUG_ON((sizeof(rreq->in_out_arg_len) != sizeof(cs.ring.offset)));
-	res = copy_to_user(&rreq->in_out_arg_len, &cs.ring.offset,
-			   sizeof(rreq->in_out_arg_len));
-	err = res > 0 ? -EFAULT : res;
+	rreq->in_out_arg_len = cs.ring.offset;
 
 	return err;
 }
@@ -531,11 +537,11 @@ static int fuse_uring_copy_to_ring(struct fuse_ring *ring, struct fuse_req *req,
 static int
 fuse_uring_prepare_send(struct fuse_ring_ent *ring_ent)
 {
-	struct fuse_ring_req *rreq = ring_ent->rreq;
+	struct fuse_ring_req *rreq = NULL;
 	struct fuse_ring_queue *queue = ring_ent->queue;
 	struct fuse_ring *ring = queue->ring;
 	struct fuse_req *req = ring_ent->fuse_req;
-	int err = 0, res;
+	int err = 0;
 
 	if (WARN_ON(ring_ent->state != FRRS_FUSE_REQ)) {
 		pr_err("qid=%d tag=%d ring-req=%p buf_req=%p invalid state %d on send\n",
@@ -551,25 +557,27 @@ fuse_uring_prepare_send(struct fuse_ring_ent *ring_ent)
 		 __func__, queue->qid, ring_ent->tag, ring_ent->state,
 		 req->in.h.opcode, req->in.h.unique);
 
+	rreq = kmap_local_page(ring_ent->user_pages[FUSE_RING_HEADER_PG]);
+
 	/* copy the request */
-	err = fuse_uring_copy_to_ring(ring, req, ring_ent);
+	err = fuse_uring_copy_to_ring(ring, req, ring_ent, rreq);
 	if (unlikely(err)) {
 		pr_info("Copy to ring failed: %d\n", err);
 		goto err;
 	}
 
 	/* copy fuse_in_header */
-	res = copy_to_user(&rreq->in, &req->in.h, sizeof(rreq->in));
-	err = res > 0 ? -EFAULT : res;
-	if (err)
-		goto err;
+	rreq->in = req->in.h;
 
+	err = 0;
 	set_bit(FR_SENT, &req->flags);
-	return 0;
-
+out:
+	if (rreq)
+		kunmap_local(rreq);
+	return err;
 err:
 	fuse_uring_req_end(ring_ent, true, err);
-	return err;
+	goto out;
 }
 
 /*
@@ -682,16 +690,13 @@ static void fuse_uring_commit(struct fuse_ring_ent *ring_ent,
 {
 	struct fuse_ring *ring = ring_ent->queue->ring;
 	struct fuse_conn *fc = ring->fc;
-	struct fuse_ring_req *rreq = ring_ent->rreq;
+	struct fuse_ring_req *rreq;
 	struct fuse_req *req = ring_ent->fuse_req;
 	ssize_t err = 0;
 	bool set_err = false;
 
-	err = copy_from_user(&req->out.h, &rreq->out, sizeof(req->out.h));
-	if (err) {
-		req->out.h.error = err;
-		goto out;
-	}
+	rreq = kmap_local_page(ring_ent->user_pages[FUSE_RING_HEADER_PG]);
+	req->out.h = rreq->out;
 
 	err = fuse_uring_out_header_has_err(&req->out.h, req, fc);
 	if (err) {
@@ -701,7 +706,8 @@ static void fuse_uring_commit(struct fuse_ring_ent *ring_ent,
 		goto out;
 	}
 
-	err = fuse_uring_copy_from_ring(ring, req, ring_ent);
+	err = fuse_uring_copy_from_ring(ring, req, ring_ent, rreq);
+	kunmap_local(rreq);
 	if (err)
 		set_err = true;
 
@@ -830,6 +836,46 @@ __must_hold(ring_ent->queue->lock)
 	return 0;
 }
 
+/*
+ * Copy from memmap.c, should be exported there
+ */
+static struct page **io_pin_pages(unsigned long uaddr, unsigned long len,
+				  int *npages)
+{
+	unsigned long start, end, nr_pages;
+	struct page **pages;
+	int ret;
+
+	end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	start = uaddr >> PAGE_SHIFT;
+	nr_pages = end - start;
+	if (WARN_ON_ONCE(!nr_pages))
+		return ERR_PTR(-EINVAL);
+
+	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+					pages);
+	/* success, mapped all pages */
+	if (ret == nr_pages) {
+		*npages = nr_pages;
+		return pages;
+	}
+
+	/* partial map, or didn't map anything */
+	if (ret >= 0) {
+		/* if we did partial map, release any pages we did get */
+		if (ret)
+			unpin_user_pages(pages, ret);
+		ret = -EFAULT;
+	}
+	kvfree(pages);
+	return ERR_PTR(ret);
+}
+
+
 /* FUSE_URING_REQ_FETCH handler */
 static int fuse_uring_fetch(struct fuse_ring_ent *ring_ent,
 			    struct io_uring_cmd *cmd, unsigned int issue_flags)
@@ -837,39 +883,48 @@ static int fuse_uring_fetch(struct fuse_ring_ent *ring_ent,
 {
 	struct fuse_ring *ring = ring_ent->queue->ring;
 	struct fuse_ring_queue *queue = ring_ent->queue;
-	int ret;
+	int err;
 
 	/* No other bit must be set here */
-	ret = -EINVAL;
+	err = -EINVAL;
 	if (ring_ent->state != FRRS_INIT)
-		goto err;
+		goto err_unlock;
 
 	/*
 	 * FUSE_URING_REQ_FETCH is an initialization exception, needs
 	 * state override
 	 */
 	ring_ent->state = FRRS_USERSPACE;
-	ret = fuse_ring_ring_ent_unset_userspace(ring_ent);
-	if (ret != 0) {
-		pr_info_ratelimited(
-			"qid=%d tag=%d register req state %d expected %d",
-			queue->qid, ring_ent->tag, ring_ent->state,
-			FRRS_INIT);
+	fuse_ring_ring_ent_unset_userspace(ring_ent);
+
+	err = _fuse_uring_fetch(ring_ent, cmd, issue_flags);
+	if (err)
+		goto err_unlock;
+
+	spin_unlock(&queue->lock);
+
+	/* must not hold the queue->lock */
+	ring_ent->user_pages = io_pin_pages(ring_ent->user_buf,
+					    ring_ent->user_buf_len,
+					    &ring_ent->nr_user_pages);
+	if (IS_ERR(ring_ent->user_pages)) {
+		err = PTR_ERR(ring_ent->user_pages);
+		pr_info("qid=%d ent=%d pin-res=%d\n",
+			queue->qid, ring_ent->tag, err);
 		goto err;
 	}
 
-	ret = _fuse_uring_fetch(ring_ent, cmd, issue_flags);
-	if (ret)
-		goto err;
-
 	/*
 	 * The ring entry is registered now and needs to be handled
 	 * for shutdown.
 	 */
 	atomic_inc(&ring->queue_refs);
-err:
+	return 0;
+
+err_unlock:
 	spin_unlock(&queue->lock);
-	return ret;
+err:
+	return err;
 }
 
 /**
@@ -920,7 +975,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	if (unlikely(fc->aborted || queue->stopped))
 		goto err_unlock;
 
-	ring_ent->rreq = (void __user *)cmd_req->buf_ptr;
+	ring_ent->user_buf = cmd_req->buf_ptr;
+	ring_ent->user_buf_len = cmd_req->buf_len;
+
 	ring_ent->max_arg_len = cmd_req->buf_len -
 				offsetof(struct fuse_ring_req, in_out_arg);
 	ret = -EINVAL;
@@ -930,7 +987,6 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 		goto err_unlock;
 	}
 
-	ring_ent->rreq = (void __user *)cmd_req->buf_ptr;
 	ring_ent->max_arg_len = cmd_req->buf_len -
 				offsetof(struct fuse_ring_req, in_out_arg);
 	if (cmd_req->buf_len < ring->req_buf_sz) {
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index f1247ee57dc4..2e43b2e9bcf2 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -60,10 +60,17 @@ struct fuse_ring_ent {
 	/* fuse_req assigned to the ring entry */
 	struct fuse_req *fuse_req;
 
-	/*
-	 * buffer provided by fuse server
-	 */
-	struct fuse_ring_req __user *rreq;
+	/* buffer provided by fuse server */
+	unsigned long __user user_buf;
+
+	/* length of user_buf */
+	size_t user_buf_len;
+
+	/* mapped user_buf pages */
+	struct page **user_pages;
+
+	/* number of user pages */
+	int nr_user_pages;
 
 	/* struct fuse_ring_req::in_out_arg size*/
 	size_t max_arg_len;
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
index 0fbb4f28261c..63e0e5dcb9f4 100644
--- a/fs/fuse/fuse_dev_i.h
+++ b/fs/fuse/fuse_dev_i.h
@@ -32,6 +32,8 @@ struct fuse_copy_state {
 	struct {
 		/* overall offset with the user buffer */
 		unsigned int offset;
+		struct page **pages;
+		int page_idx;
 	} ring;
 };
 

-- 
2.43.0





[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux