Baoquan He <bhe@xxxxxxxxxx> writes:

> Currently, vread() can read out vmalloc areas which are associated with
> a vm_struct, but this doesn't work for areas created by the vm_map_ram()
> interface because it doesn't allocate a vm_struct. So in vread(), these
> areas will be skipped.
>
> Here, add a new function vb_vread() to read out areas managed by
> vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags
> and handle them accordingly.
>
> Stephen Brennan <stephen.s.brennan@xxxxxxxxxx>
> Signed-off-by: Baoquan He <bhe@xxxxxxxxxx>
> Link: https://lore.kernel.org/all/87ilk6gos2.fsf@xxxxxxxxxx/T/#u
> ---
>  mm/vmalloc.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 51 insertions(+), 6 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 41d82dc07e13..5a8d5659bfb0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3518,6 +3518,46 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
>          return copied;
>  }
>
> +static void vb_vread(char *buf, char *addr, int count)
> +{
> +        char *start;
> +        struct vmap_block *vb;
> +        unsigned long offset;
> +        unsigned int rs, re, n;
> +
> +        offset = ((unsigned long)addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
> +        vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> +
> +        spin_lock(&vb->lock);
> +        if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> +                spin_unlock(&vb->lock);
> +                memset(buf, 0, count);
> +                return;
> +        }
> +        for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
> +                if (!count)
> +                        break;
> +                start = vmap_block_vaddr(vb->va->va_start, rs);
> +                if (addr < start) {
> +                        if (count == 0)
> +                                break;
> +                        *buf = '\0';
> +                        buf++;
> +                        addr++;
> +                        count--;
> +                }
> +                n = (re - rs + 1) << PAGE_SHIFT;
> +                if (n > count)
> +                        n = count;
> +                aligned_vread(buf, start, n);
> +
> +                buf += n;
> +                addr += n;
> +                count -= n;
> +        }
> +        spin_unlock(&vb->lock);
> +}
> +
>  /**
>   * vread() - read vmalloc area in a safe way.
>   * @buf: buffer for reading data
> @@ -3548,7 +3588,7 @@ long vread(char *buf, char *addr, unsigned long count)
>          struct vm_struct *vm;
>          char *vaddr, *buf_start = buf;
>          unsigned long buflen = count;
> -        unsigned long n;
> +        unsigned long n, size;
>
>          addr = kasan_reset_tag(addr);
>
> @@ -3569,12 +3609,14 @@ long vread(char *buf, char *addr, unsigned long count)
>                  if (!count)
>                          break;
>
> -                if (!va->vm)
> +                if (!(va->flags & VMAP_RAM) && !va->vm)
>                          continue;
>
>                  vm = va->vm;
> -                vaddr = (char *) vm->addr;
> -                if (addr >= vaddr + get_vm_area_size(vm))
> +                vaddr = (char *) va->va_start;
> +                size = vm ? get_vm_area_size(vm) : va_size(va);

Hi Baoquan,

Thanks for working on this. I tested your patches by using drgn to
debug /proc/kcore. I have a kernel module[1] that does a vm_map_ram()
call and prints the virtual address to the kernel log, so I can try to
read that memory address in drgn. When I did this test, I got a panic
on the above line of code.
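Roughly, the module just maps a few pages with vm_map_ram() and logs
the returned address; a simplified, untested sketch of it (not the
exact code behind [1], and the module/symbol names here are made up)
looks like this:

    #include <linux/module.h>
    #include <linux/mm.h>
    #include <linux/gfp.h>
    #include <linux/numa.h>
    #include <linux/vmalloc.h>

    #define NR_TEST_PAGES	16

    static struct page *pages[NR_TEST_PAGES];
    static void *test_addr;

    static int __init vmr_test_init(void)
    {
    	int i;

    	for (i = 0; i < NR_TEST_PAGES; i++) {
    		pages[i] = alloc_page(GFP_KERNEL);
    		if (!pages[i])
    			goto free_pages;
    	}

    	/* vm_map_ram() areas have no vm_struct, which is the case
    	 * vread() used to skip and this series tries to handle. */
    	test_addr = vm_map_ram(pages, NR_TEST_PAGES, NUMA_NO_NODE);
    	if (!test_addr)
    		goto free_pages;

    	/* Print the unhashed address so it can be read via /proc/kcore. */
    	pr_info("vm_map_ram test area at %px\n", test_addr);
    	return 0;

    free_pages:
    	while (--i >= 0)
    		__free_page(pages[i]);
    	return -ENOMEM;
    }

    static void __exit vmr_test_exit(void)
    {
    	int i;

    	vm_unmap_ram(test_addr, NR_TEST_PAGES);
    	for (i = 0; i < NR_TEST_PAGES; i++)
    		__free_page(pages[i]);
    }

    module_init(vmr_test_init);
    module_exit(vmr_test_exit);
    MODULE_LICENSE("GPL");

With the module loaded, reading the printed address from /proc/kcore in
drgn hits the new path in vread(). The panic: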
[ 167.101113] BUG: kernel NULL pointer dereference, address: 0000000000000013
[ 167.104538] #PF: supervisor read access in kernel mode
[ 167.106446] #PF: error_code(0x0000) - not-present page
[ 167.108474] PGD 0 P4D 0
[ 167.109311] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 167.111727] CPU: 3 PID: 7647 Comm: drgn Kdump: loaded Tainted: G OE 6.1.0-rc4.bugvreadtest.el8.dev02.x86_64 #1
[ 167.115076] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.5.1 06/16/2021
[ 167.117348] RIP: 0010:vread+0xaf/0x210
[ 167.118345] Code: 86 3e 01 00 00 48 85 db 0f 84 35 01 00 00 49 8d 47 28 48 3d 10 f8 a7 8f 0f 84 25 01 00 00 4d 89 f4 49 8b 57 38 48 85 d2 74 21 <48> 8b 42 10 f6 42 18 40 48 89 d6 49 8b 0f 48 8d b8 00 f0 ff ff 48
[ 167.123776] RSP: 0018:ffffaeb380a1fb90 EFLAGS: 00010206
[ 167.125669] RAX: ffff9853a1397b28 RBX: 0000000000000040 RCX: 0000000000000000
[ 167.128401] RDX: 0000000000000003 RSI: 0000000000000000 RDI: 0000000000000000
[ 167.130948] RBP: ffffaeb382400000 R08: 0000000000000000 R09: 0000000000000000
[ 167.133372] R10: 0000000000000000 R11: 0000000000000000 R12: ffff985385877000
[ 167.135397] R13: 0000000000000040 R14: ffff985385877000 R15: ffff9853a1397b00
[ 167.137533] FS: 00007f71eae33b80(0000) GS:ffff9856afd80000(0000) knlGS:0000000000000000
[ 167.140210] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 167.142440] CR2: 0000000000000013 CR3: 000000012048a000 CR4: 00000000003506e0
[ 167.144640] Call Trace:
[ 167.145494]  <TASK>
[ 167.146263]  read_kcore+0x33a/0xa30
[ 167.147392]  ? remove_entity_load_avg+0x2e/0x70
[ 167.148425]  ? _raw_spin_unlock_irqrestore+0x11/0x60
[ 167.150657]  ? __wake_up_common_lock+0x8b/0xd0
[ 167.152261]  ? tty_set_termios+0x211/0x280
[ 167.153397]  ? set_termios+0x16b/0x1d0
[ 167.154698]  ? _raw_spin_unlock+0xe/0x40
[ 167.155737]  ? wp_page_reuse+0x60/0x80
[ 167.157138]  ? do_wp_page+0x169/0x3a0
[ 167.158752]  ? pmd_pfn+0x9/0x50
[ 167.159645]  ? __handle_mm_fault+0x3b0/0x690
[ 167.160837]  ? inode_security+0x22/0x60
[ 167.161761]  proc_reg_read+0x5a/0xb0
[ 167.162777]  vfs_read+0xa7/0x320
[ 167.163512]  ? handle_mm_fault+0xb6/0x2c0
[ 167.164400]  __x64_sys_pread64+0x9c/0xd0
[ 167.165763]  do_syscall_64+0x3f/0xa0
[ 167.167610]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 167.169951] RIP: 0033:0x7f71e9c123d7

I debugged the resulting core dump and found the reason:

>>> stack_trace = prog.crashed_thread().stack_trace()
>>> stack_trace
#0  crash_setup_regs (./arch/x86/include/asm/kexec.h:95:3)
#1  __crash_kexec (kernel/kexec_core.c:974:4)
#2  panic (kernel/panic.c:330:3)
#3  oops_end (arch/x86/kernel/dumpstack.c:379:3)
#4  page_fault_oops (arch/x86/mm/fault.c:729:2)
#5  handle_page_fault (arch/x86/mm/fault.c:1519:3)
#6  exc_page_fault (arch/x86/mm/fault.c:1575:2)
#7  asm_exc_page_fault+0x26/0x2b (./arch/x86/include/asm/idtentry.h:570)
#8  get_vm_area_size (./include/linux/vmalloc.h:203:14)
#9  vread (mm/vmalloc.c:3617:15)
#10 read_kcore (fs/proc/kcore.c:510:4)
#11 pde_read (fs/proc/inode.c:316:10)
#12 proc_reg_read (fs/proc/inode.c:328:8)
#13 vfs_read (fs/read_write.c:468:9)
#14 ksys_pread64 (fs/read_write.c:665:10)
#15 __do_sys_pread64 (fs/read_write.c:675:9)
#16 __se_sys_pread64 (fs/read_write.c:672:1)
#17 __x64_sys_pread64 (fs/read_write.c:672:1)
#18 do_syscall_x64 (arch/x86/entry/common.c:50:14)
#19 do_syscall_64 (arch/x86/entry/common.c:80:7)
#20 entry_SYSCALL_64+0x9f/0x19b (arch/x86/entry/entry_64.S:120)
#21 0x7f71e9c123d7
>>> stack_trace[9]["va"]
*(struct vmap_area *)0xffff9853a1397b00 = {
        .va_start = (unsigned long)18446654684740452352,
        .va_end = (unsigned long)18446654684741500928,
        .rb_node = (struct rb_node){
                .__rb_parent_color = (unsigned long)18446630083335569168,
                .rb_right = (struct rb_node *)0x0,
                .rb_left = (struct rb_node *)0x0,
        },
        .list = (struct list_head){
                .next = (struct list_head *)0xffff98538c403f28,
                .prev = (struct list_head *)0xffff98538c54e1e8,
        },
        .subtree_max_size = (unsigned long)3,
        .vm = (struct vm_struct *)0x3,
        .flags = (unsigned long)3,
}

Since flags is in a union, it shadows "vm": the non-zero flags value
makes va->vm look like a non-NULL pointer, so the area isn't skipped
and get_vm_area_size() ends up following a pointer that is really just
the flags value. (A small userspace sketch of the aliasing is appended
after the quoted patch below.) I'm not sure if the fix is to have flags
be a separate field inside vmap_area, or to have more careful handling
in the vread path.

Thanks,
Stephen

> +
> +                if (addr >= vaddr + size)
>                          continue;
>                  while (addr < vaddr) {
>                          if (count == 0)
> @@ -3584,10 +3626,13 @@ long vread(char *buf, char *addr, unsigned long count)
>                          addr++;
>                          count--;
>                  }
> -                n = vaddr + get_vm_area_size(vm) - addr;
> +                n = vaddr + size - addr;
>                  if (n > count)
>                          n = count;
> -                if (!(vm->flags & VM_IOREMAP))
> +
> +                if ((va->flags & (VMAP_RAM|VMAP_BLOCK)) == (VMAP_RAM|VMAP_BLOCK))
> +                        vb_vread(buf, addr, n);
> +                else if ((va->flags & VMAP_RAM) || !(vm->flags & VM_IOREMAP))
>                          aligned_vread(buf, addr, n);
>                  else        /* IOREMAP area is treated as memory hole */
>                          memset(buf, 0, n);
> --
> 2.34.1
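P.S. To make the aliasing concrete, here is a tiny userspace sketch.
The struct layout follows what the dump above implies for this series
(flags sharing the union with vm); it is only an illustration, not
kernel code:

    #include <stdio.h>

    #define VMAP_RAM	0x1	/* flag values as introduced by this series */
    #define VMAP_BLOCK	0x2

    struct vm_struct;		/* opaque here */

    /* Reduced copy of vmap_area with the layout the dump suggests:
     * flags lives in the same union as vm and subtree_max_size. */
    struct vmap_area_sketch {
    	unsigned long va_start;
    	unsigned long va_end;
    	union {
    		unsigned long subtree_max_size;	/* in "free" tree */
    		struct vm_struct *vm;		/* in "busy" tree */
    		unsigned long flags;		/* vm_map_ram area type */
    	};
    };

    int main(void)
    {
    	struct vmap_area_sketch va = { 0 };

    	/* What the series sets for a vm_map_ram/vmap_block area. */
    	va.flags = VMAP_RAM | VMAP_BLOCK;

    	/* vread()'s skip check falls through, since flags is non-zero ... */
    	if (!(va.flags & VMAP_RAM) && !va.vm)
    		printf("area skipped\n");
    	else
    		/* ... but va.vm now reads back as the flags value, 0x3 -
    		 * the bogus pointer get_vm_area_size() dereferenced, which
    		 * is consistent with CR2 = 0x13 in the oops (a small field
    		 * offset added to 0x3). */
    		printf("vm = %p (bogus, would be dereferenced)\n",
    		       (void *)va.vm);

    	return 0;
    }

Either option mentioned above would avoid this: a separate flags field
keeps va->vm NULL for these areas, and checking the VMAP_RAM bits
before touching va->vm would sidestep the aliasing.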