Baoquan He <bhe@xxxxxxxxxx> writes:

> Currently, vread() can read out vmalloc areas which are associated with
> a vm_struct, but this doesn't work for areas created by the vm_map_ram()
> interface because it doesn't allocate a vm_struct. So in vread(), these
> areas will be skipped.
>
> Here, add a new function vb_vread() to read out areas managed by
> vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags
> and handle them accordingly.
>
> Stephen Brennan <stephen.s.brennan@xxxxxxxxxx>
> Signed-off-by: Baoquan He <bhe@xxxxxxxxxx>
> Link: https://lore.kernel.org/all/87ilk6gos2.fsf@xxxxxxxxxx/T/#u
> ---
>  mm/vmalloc.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 51 insertions(+), 6 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 41d82dc07e13..5a8d5659bfb0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3518,6 +3518,46 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
>          return copied;
>  }
>
> +static void vb_vread(char *buf, char *addr, int count)
> +{
> +        char *start;
> +        struct vmap_block *vb;
> +        unsigned long offset;
> +        unsigned int rs, re, n;
> +
> +        offset = ((unsigned long)addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
> +        vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> +
> +        spin_lock(&vb->lock);
> +        if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> +                spin_unlock(&vb->lock);
> +                memset(buf, 0, count);
> +                return;
> +        }
> +        for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
> +                if (!count)
> +                        break;
> +                start = vmap_block_vaddr(vb->va->va_start, rs);
> +                if (addr < start) {
> +                        if (count == 0)
> +                                break;
> +                        *buf = '\0';
> +                        buf++;
> +                        addr++;
> +                        count--;
> +                }
> +                n = (re - rs + 1) << PAGE_SHIFT;
> +                if (n > count)
> +                        n = count;
> +                aligned_vread(buf, start, n);
> +
> +                buf += n;
> +                addr += n;
> +                count -= n;
> +        }
> +        spin_unlock(&vb->lock);
> +}
> +
>  /**
>   * vread() - read vmalloc area in a safe way.
>   * @buf: buffer for reading data
> @@ -3548,7 +3588,7 @@ long vread(char *buf, char *addr, unsigned long count)
>          struct vm_struct *vm;
>          char *vaddr, *buf_start = buf;
>          unsigned long buflen = count;
> -        unsigned long n;
> +        unsigned long n, size;
>
>          addr = kasan_reset_tag(addr);
>
> @@ -3569,12 +3609,14 @@ long vread(char *buf, char *addr, unsigned long count)
>                  if (!count)
>                          break;
>
> -                if (!va->vm)
> +                if (!(va->flags & VMAP_RAM) && !va->vm)
>                          continue;
>
>                  vm = va->vm;
> -                vaddr = (char *) vm->addr;
> -                if (addr >= vaddr + get_vm_area_size(vm))
> +                vaddr = (char *) va->va_start;
> +                size = vm ? get_vm_area_size(vm) : va_size(va);

Hi Baoquan,

Thanks for working on this. I tested your patches by using drgn to
debug /proc/kcore. I have a kernel module[1] that does a vm_map_ram()
call and prints the virtual address to the kernel log, so I can try to
read that memory address in drgn. When I did this test, I got a panic
on the above line of code.
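Roughly, the module just maps a few pages with vm_map_ram() and logs
the returned address; a simplified, untested sketch of it (not the
exact code behind [1], and the module/symbol names here are made up)
looks like this:

    #include <linux/module.h>
    #include <linux/mm.h>
    #include <linux/gfp.h>
    #include <linux/numa.h>
    #include <linux/vmalloc.h>

    #define NR_TEST_PAGES	16

    static struct page *pages[NR_TEST_PAGES];
    static void *test_addr;

    static int __init vmr_test_init(void)
    {
    	int i;

    	for (i = 0; i < NR_TEST_PAGES; i++) {
    		pages[i] = alloc_page(GFP_KERNEL);
    		if (!pages[i])
    			goto free_pages;
    	}

    	/* vm_map_ram() areas have no vm_struct, which is the case
    	 * vread() used to skip and this series tries to handle. */
    	test_addr = vm_map_ram(pages, NR_TEST_PAGES, NUMA_NO_NODE);
    	if (!test_addr)
    		goto free_pages;

    	/* Print the unhashed address so it can be read via /proc/kcore. */
    	pr_info("vm_map_ram test area at %px\n", test_addr);
    	return 0;

    free_pages:
    	while (--i >= 0)
    		__free_page(pages[i]);
    	return -ENOMEM;
    }

    static void __exit vmr_test_exit(void)
    {
    	int i;

    	vm_unmap_ram(test_addr, NR_TEST_PAGES);
    	for (i = 0; i < NR_TEST_PAGES; i++)
    		__free_page(pages[i]);
    }

    module_init(vmr_test_init);
    module_exit(vmr_test_exit);
    MODULE_LICENSE("GPL");

With the module loaded, reading the printed address from /proc/kcore in
drgn hits the new path in vread(). The panic: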
[ 167.101113] BUG: kernel NULL pointer dereference, address: 0000000000000013
[ 167.104538] #PF: supervisor read access in kernel mode
[ 167.106446] #PF: error_code(0x0000) - not-present page
[ 167.108474] PGD 0 P4D 0
[ 167.109311] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 167.111727] CPU: 3 PID: 7647 Comm: drgn Kdump: loaded Tainted: G OE 6.1.0-rc4.bugvreadtest.el8.dev02.x86_64 #1
[ 167.115076] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.5.1 06/16/2021
[ 167.117348] RIP: 0010:vread+0xaf/0x210
[ 167.118345] Code: 86 3e 01 00 00 48 85 db 0f 84 35 01 00 00 49 8d 47 28 48 3d 10 f8 a7 8f 0f 84 25 01 00 00 4d 89 f4 49 8b 57 38 48 85 d2 74 21 <48> 8b 42 10 f6 42 18 40 48 89 d6 49 8b 0f 48 8d b8 00 f0 ff ff 48
[ 167.123776] RSP: 0018:ffffaeb380a1fb90 EFLAGS: 00010206
[ 167.125669] RAX: ffff9853a1397b28 RBX: 0000000000000040 RCX: 0000000000000000
[ 167.128401] RDX: 0000000000000003 RSI: 0000000000000000 RDI: 0000000000000000
[ 167.130948] RBP: ffffaeb382400000 R08: 0000000000000000 R09: 0000000000000000
[ 167.133372] R10: 0000000000000000 R11: 0000000000000000 R12: ffff985385877000
[ 167.135397] R13: 0000000000000040 R14: ffff985385877000 R15: ffff9853a1397b00
[ 167.137533] FS: 00007f71eae33b80(0000) GS:ffff9856afd80000(0000) knlGS:0000000000000000
[ 167.140210] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 167.142440] CR2: 0000000000000013 CR3: 000000012048a000 CR4: 00000000003506e0
[ 167.144640] Call Trace:
[ 167.145494]  <TASK>
[ 167.146263]  read_kcore+0x33a/0xa30
[ 167.147392]  ? remove_entity_load_avg+0x2e/0x70
[ 167.148425]  ? _raw_spin_unlock_irqrestore+0x11/0x60
[ 167.150657]  ? __wake_up_common_lock+0x8b/0xd0
[ 167.152261]  ? tty_set_termios+0x211/0x280
[ 167.153397]  ? set_termios+0x16b/0x1d0
[ 167.154698]  ? _raw_spin_unlock+0xe/0x40
[ 167.155737]  ? wp_page_reuse+0x60/0x80
[ 167.157138]  ? do_wp_page+0x169/0x3a0
[ 167.158752]  ? pmd_pfn+0x9/0x50
[ 167.159645]  ? __handle_mm_fault+0x3b0/0x690
[ 167.160837]  ? inode_security+0x22/0x60
[ 167.161761]  proc_reg_read+0x5a/0xb0
[ 167.162777]  vfs_read+0xa7/0x320
[ 167.163512]  ? handle_mm_fault+0xb6/0x2c0
[ 167.164400]  __x64_sys_pread64+0x9c/0xd0
[ 167.165763]  do_syscall_64+0x3f/0xa0
[ 167.167610]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 167.169951] RIP: 0033:0x7f71e9c123d7

I debugged the resulting core dump and found the reason:

>>> stack_trace = prog.crashed_thread().stack_trace()
>>> stack_trace
#0  crash_setup_regs (./arch/x86/include/asm/kexec.h:95:3)
#1  __crash_kexec (kernel/kexec_core.c:974:4)
#2  panic (kernel/panic.c:330:3)
#3  oops_end (arch/x86/kernel/dumpstack.c:379:3)
#4  page_fault_oops (arch/x86/mm/fault.c:729:2)
#5  handle_page_fault (arch/x86/mm/fault.c:1519:3)
#6  exc_page_fault (arch/x86/mm/fault.c:1575:2)
#7  asm_exc_page_fault+0x26/0x2b (./arch/x86/include/asm/idtentry.h:570)
#8  get_vm_area_size (./include/linux/vmalloc.h:203:14)
#9  vread (mm/vmalloc.c:3617:15)
#10 read_kcore (fs/proc/kcore.c:510:4)
#11 pde_read (fs/proc/inode.c:316:10)
#12 proc_reg_read (fs/proc/inode.c:328:8)
#13 vfs_read (fs/read_write.c:468:9)
#14 ksys_pread64 (fs/read_write.c:665:10)
#15 __do_sys_pread64 (fs/read_write.c:675:9)
#16 __se_sys_pread64 (fs/read_write.c:672:1)
#17 __x64_sys_pread64 (fs/read_write.c:672:1)
#18 do_syscall_x64 (arch/x86/entry/common.c:50:14)
#19 do_syscall_64 (arch/x86/entry/common.c:80:7)
#20 entry_SYSCALL_64+0x9f/0x19b (arch/x86/entry/entry_64.S:120)
#21 0x7f71e9c123d7
>>> stack_trace[9]["va"]
*(struct vmap_area *)0xffff9853a1397b00 = {
        .va_start = (unsigned long)18446654684740452352,
        .va_end = (unsigned long)18446654684741500928,
        .rb_node = (struct rb_node){
                .__rb_parent_color = (unsigned long)18446630083335569168,
                .rb_right = (struct rb_node *)0x0,
                .rb_left = (struct rb_node *)0x0,
        },
        .list = (struct list_head){
                .next = (struct list_head *)0xffff98538c403f28,
                .prev = (struct list_head *)0xffff98538c54e1e8,
        },
        .subtree_max_size = (unsigned long)3,
        .vm = (struct vm_struct *)0x3,
        .flags = (unsigned long)3,
}

Since flags is in a union, it shadows "vm": the non-zero flags value
makes va->vm look like a non-NULL pointer, so the area isn't skipped
and get_vm_area_size() ends up following a pointer that is really just
the flags value. (A small userspace sketch of the aliasing is appended
after the quoted patch below.) I'm not sure if the fix is to have flags
be a separate field inside vmap_area, or to have more careful handling
in the vread path.

Thanks,
Stephen

> +
> +                if (addr >= vaddr + size)
>                          continue;
>                  while (addr < vaddr) {
>                          if (count == 0)
> @@ -3584,10 +3626,13 @@ long vread(char *buf, char *addr, unsigned long count)
>                          addr++;
>                          count--;
>                  }
> -                n = vaddr + get_vm_area_size(vm) - addr;
> +                n = vaddr + size - addr;
>                  if (n > count)
>                          n = count;
> -                if (!(vm->flags & VM_IOREMAP))
> +
> +                if ((va->flags & (VMAP_RAM|VMAP_BLOCK)) == (VMAP_RAM|VMAP_BLOCK))
> +                        vb_vread(buf, addr, n);
> +                else if ((va->flags & VMAP_RAM) || !(vm->flags & VM_IOREMAP))
>                          aligned_vread(buf, addr, n);
>                  else        /* IOREMAP area is treated as memory hole */
>                          memset(buf, 0, n);
> --
> 2.34.1
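P.S. To make the aliasing concrete, here is a tiny userspace sketch.
The struct layout follows what the dump above implies for this series
(flags sharing the union with vm); it is only an illustration, not
kernel code:

    #include <stdio.h>

    #define VMAP_RAM	0x1	/* flag values as introduced by this series */
    #define VMAP_BLOCK	0x2

    struct vm_struct;		/* opaque here */

    /* Reduced copy of vmap_area with the layout the dump suggests:
     * flags lives in the same union as vm and subtree_max_size. */
    struct vmap_area_sketch {
    	unsigned long va_start;
    	unsigned long va_end;
    	union {
    		unsigned long subtree_max_size;	/* in "free" tree */
    		struct vm_struct *vm;		/* in "busy" tree */
    		unsigned long flags;		/* vm_map_ram area type */
    	};
    };

    int main(void)
    {
    	struct vmap_area_sketch va = { 0 };

    	/* What the series sets for a vm_map_ram/vmap_block area. */
    	va.flags = VMAP_RAM | VMAP_BLOCK;

    	/* vread()'s skip check falls through, since flags is non-zero ... */
    	if (!(va.flags & VMAP_RAM) && !va.vm)
    		printf("area skipped\n");
    	else
    		/* ... but va.vm now reads back as the flags value, 0x3 -
    		 * the bogus pointer get_vm_area_size() dereferenced, which
    		 * is consistent with CR2 = 0x13 in the oops (a small field
    		 * offset added to 0x3). */
    		printf("vm = %p (bogus, would be dereferenced)\n",
    		       (void *)va.vm);

    	return 0;
    }

Either option mentioned above would avoid this: a separate flags field
keeps va->vm NULL for these areas, and checking the VMAP_RAM bits
before touching va->vm would sidestep the aliasing.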