User can specify a smaller meta buffer than what the device is wired to update/access. Kernel makes a copy of the meta buffer into which the device does DMA. As a result, the device overwrites the unrelated kernel memory, causing random kernel crashes. Same issue is possible for extended-lba case also. When user specifies a short unaligned buffer, the kernel makes a copy and uses that for DMA. Detect these situations for sync/uring passthrough, and bail out. Fixes: 456cba386e94 ("nvme: wire-up uring-cmd support for io-passthru on char-device") Cc: stable@xxxxxxxxxxxxxxx Reported-by: Vincent Fu <vincent.fu@xxxxxxxxxxx> Signed-off-by: Kanchan Joshi <joshi.k@xxxxxxxxxxx> --- Changes since v2: - Handle extended-lba case: short unaligned buffer IO - Reduce the scope of check to only well-known commands - Do not check early. Move it deeper so that check gets executed less often - Combine two patches into one. Changes since v1: - Revise the check to exclude PRACT=1 case Random crash example: [ 6815.014478] general protection fault, probably for non-canonical address 0x70e3cdbe9133b7a6: 0000 [#1] PREEMPT SMP PTI [ 6815.014505] CPU: 1 PID: 434 Comm: systemd-timesyn Tainted: G OE 6.4.0-rc3+ #5 [ 6815.014516] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [ 6815.014522] RIP: 0010:__kmem_cache_alloc_node+0x100/0x440 [ 6815.014551] Code: 48 85 c0 0f 84 fb 02 00 00 41 83 ff ff 74 10 48 8b 00 48 c1 e8 36 41 39 c7 0f 85 e5 02 00 00 41 8b 45 28 49 8b 7d 00 4c 01 e0 <48> 8b 18 48 89 c1 49 33 9d b8 00 00 00 4c 89 e0 48 0f c9 48 31 cb [ 6815.014559] RSP: 0018:ffffb510c0577d18 EFLAGS: 00010216 [ 6815.014569] RAX: 70e3cdbe9133b7a6 RBX: ffff8a9ec1042300 RCX: 0000000000000010 [ 6815.014575] RDX: 00000000048b0001 RSI: 0000000000000dc0 RDI: 0000000000037060 [ 6815.014581] RBP: ffffb510c0577d58 R08: ffffffffb9ffa280 R09: 0000000000000000 [ 6815.014586] R10: ffff8a9ecbcab1f0 R11: 0000000000000000 R12: 70e3cdbe9133b79e [ 6815.014591] R13: ffff8a9ec1042300 R14: 0000000000000dc0 R15: 00000000ffffffff [ 6815.014597] FS: 00007fce590d6940(0000) GS:ffff8a9f3dd00000(0000) knlGS:0000000000000000 [ 6815.014604] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 6815.014609] CR2: 00005579abbb6498 CR3: 000000000d9b0000 CR4: 00000000000006e0 [ 6815.014622] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 6815.014627] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 6815.014632] Call Trace: [ 6815.014650] <TASK> [ 6815.014655] ? apparmor_sk_alloc_security+0x40/0x80 [ 6815.014673] kmalloc_trace+0x2a/0xa0 [ 6815.014684] apparmor_sk_alloc_security+0x40/0x80 [ 6815.014694] security_sk_alloc+0x3f/0x60 [ 6815.014703] sk_prot_alloc+0x75/0x110 [ 6815.014712] sk_alloc+0x31/0x200 [ 6815.014721] inet_create+0xd8/0x3a0 [ 6815.014734] __sock_create+0x11b/0x220 [ 6815.014749] __sys_socket_create.part.0+0x49/0x70 [ 6815.014756] ? __secure_computing+0x94/0xf0 [ 6815.014768] __sys_socket+0x3c/0xc0 [ 6815.014776] __x64_sys_socket+0x1a/0x30 [ 6815.014783] do_syscall_64+0x3b/0x90 [ 6815.014794] entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 6815.014804] RIP: 0033:0x7fce59aa795b drivers/nvme/host/ioctl.c | 76 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index d8ff796fd5f2..8147750beff4 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -158,6 +158,67 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, return req; } +static inline bool nvme_nlb_in_cdw12(u8 opcode) +{ + switch (opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + case nvme_cmd_compare: + case nvme_cmd_zone_append: + return true; + } + return false; +} + +static bool nvme_validate_passthru_meta(struct nvme_ns *ns, + struct nvme_command *c, + __u32 meta_len, + unsigned data_len) +{ + /* + * User may specify smaller meta-buffer with a larger data-buffer. + * Driver allocated meta buffer will also be small. + * Device can do larger dma into that, overwriting unrelated kernel + * memory. + * For extended lba case, if user provides a short unaligned buffer, + * the device will corrupt kernel memory. + * Avoid running into corruption in both situations. + */ + bool ext_lba = ns->features & NVME_NS_EXT_LBAS; + u16 nlb, control; + unsigned dlen, mlen; + + /* Exclude commands that do not have nlb in cdw12 */ + if (!nvme_nlb_in_cdw12(c->common.opcode)) + return true; + + control = upper_16_bits(le32_to_cpu(c->common.cdw12)); + /* Exclude when meta transfer from/to host is not done */ + if (control & NVME_RW_PRINFO_PRACT && ns->ms == ns->pi_size) + return true; + + nlb = lower_16_bits(le32_to_cpu(c->common.cdw12)); + mlen = (nlb + 1) * ns->ms; + + /* sanity for interleaved buffer */ + if (ext_lba) { + dlen = (nlb + 1) << ns->lba_shift; + if (data_len < (dlen + mlen)) + goto out_false; + return true; + } + /* sanity for separate meta buffer */ + if (meta_len < mlen) + goto out_false; + + return true; + +out_false: + dev_err(ns->ctrl->device, + "%s: metadata length is small!\n", current->comm); + return false; +} + static int nvme_map_user_request(struct request *req, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd, @@ -194,6 +255,12 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, bio_set_dev(bio, bdev); if (bdev && meta_buffer && meta_len) { + if (!nvme_validate_passthru_meta(ns, nvme_req(req)->cmd, + meta_len, bufflen)) { + ret = -EINVAL; + goto out_unmap; + } + meta = nvme_add_user_metadata(req, meta_buffer, meta_len, meta_seed); if (IS_ERR(meta)) { @@ -203,6 +270,15 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, *metap = meta; } + /* guard another case when kernel memory is being used */ + if (bio->bi_private && ns && ns->features & NVME_NS_EXT_LBAS) { + if (!nvme_validate_passthru_meta(ns, nvme_req(req)->cmd, + meta_len, bufflen)) { + ret = -EINVAL; + goto out_unmap; + } + } + return ret; out_unmap: -- 2.25.1