On Fri, 24 Nov 2023 at 02:28, Christian Brauner <brauner@xxxxxxxxxx> wrote:
>
> * Fix a bug introduced with the iov_iter rework from last cycle.
>
> This broke /proc/kcore by copying too much and without the correct offset.

Ugh. I think the whole /proc/kcore vmalloc handling is just COMPLETELY
broken. It does this:

        /*
         * vmalloc uses spinlocks, so we optimistically try to
         * read memory. If this fails, fault pages in and try
         * again until we are done.
         */
        while (true) {
                read += vread_iter(iter, src, left);
                if (read == tsz)
                        break;

                src += read;
                left -= read;

                if (fault_in_iov_iter_writeable(iter, left)) {
                        ret = -EFAULT;
                        goto out;
                }
        }

and that is just broken beyond words for two totally independent
reasons:

 (a) vread_iter() looks like it can fail because of not having a
     source, and return 0 (I dunno - it seems to try to avoid that,
     but it all looks pretty dodgy)

     At that point fault_in_iov_iter_writeable() will try to fault in
     the destination, which may work just fine, but if the source was
     the problem, you'd have an endless loop.

 (b) That "read += X" is completely broken anyway. It should be just
     a "=". So that whole loop is crazy broken, and only works for the
     case where you get it all in one go. This code is crap.

Now, I think it all works in practice for one simple reason: I doubt
anybody uses this (and it looks like the callees in the while loop try
very hard to always fill the whole area - maybe people noticed the
first bug and tried to work around it that way).

I guess there is at least one test program, but it presumably doesn't
trigger or care about the bugs here.

But I think we can get rid of this all, and just deal with the
KCORE_VMALLOC case exactly the same way we already deal with VMEMMAP
and TEXT: by just doing copy_from_kernel_nofault() into a bounce
buffer, and then doing a regular _copy_to_iter() or whatever.

NOTE! I looked at the code, and threw up in my mouth a little, and
maybe I missed something. Maybe it all works fine. But Omar - since
you found the original problem, may I implore you to test this
attached patch?

I just like how the patch looks:

 6 files changed, 1 insertion(+), 368 deletions(-)

and those 350+ deleted lines really looked disgusting to me.

This patch is on top of the pull I did, because obviously the fix in
that pull was correct, I just think we should go further and get rid
of this whole mess entirely.

              Linus
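For reference, the path that KCORE_VMALLOC falls through to after the
patch below is the existing KCORE_VMEMMAP/KCORE_TEXT arm of that same
switch. A minimal sketch of that bounce-buffer shape (assuming the
page-sized bounce buffer in read_kcore_iter() is called "buf" - the
name is an assumption, the pattern is what matters):

        case KCORE_VMALLOC:
        case KCORE_VMEMMAP:
        case KCORE_TEXT:
                /*
                 * Sketch: copy through a bounce buffer.
                 * copy_from_kernel_nofault() fails cleanly instead of
                 * oopsing on an unmapped source, so a bad range is
                 * simply zero-filled - no retry loop at all.
                 */
                if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
                        /* source unreadable: hand userspace zeroes */
                        if (iov_iter_zero(tsz, iter) != tsz) {
                                ret = -EFAULT;
                                goto out;
                        }
                } else if (_copy_to_iter(buf, tsz, iter) != tsz) {
                        /* bounce buffer is ordinary kernel memory */
                        ret = -EFAULT;
                        goto out;
                }
                break;

With that shape both problems above disappear: a source-side failure
becomes zeroes via iov_iter_zero(), a destination-side failure is a
plain -EFAULT, and there is no accumulating "read +=" bookkeeping left
to get wrong.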
 fs/proc/kcore.c         |  26 +----
 include/linux/uio.h     |   3 -
 include/linux/vmalloc.h |   3 -
 lib/iov_iter.c          |  33 ------
 mm/nommu.c              |   9 --
 mm/vmalloc.c            | 295 ------------------------------------------------
 6 files changed, 1 insertion(+), 368 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6422e569b080..83a39f4d1ddc 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -504,31 +504,6 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
                 }
 
                 switch (m->type) {
-                case KCORE_VMALLOC:
-                {
-                        const char *src = (char *)start;
-                        size_t read = 0, left = tsz;
-
-                        /*
-                         * vmalloc uses spinlocks, so we optimistically try to
-                         * read memory. If this fails, fault pages in and try
-                         * again until we are done.
-                         */
-                        while (true) {
-                                read += vread_iter(iter, src, left);
-                                if (read == tsz)
-                                        break;
-
-                                src += read;
-                                left -= read;
-
-                                if (fault_in_iov_iter_writeable(iter, left)) {
-                                        ret = -EFAULT;
-                                        goto out;
-                                }
-                        }
-                        break;
-                }
                 case KCORE_USER:
                         /* User page is handled prior to normal kernel page: */
                         if (copy_to_iter((char *)start, tsz, iter) != tsz) {
@@ -555,6 +530,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
                                 break;
                         }
                         fallthrough;
+                case KCORE_VMALLOC:
                 case KCORE_VMEMMAP:
                 case KCORE_TEXT:
                         /*
diff --git a/include/linux/uio.h b/include/linux/uio.h
index b6214cbf2a43..993a6bd8bdd3 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -187,9 +187,6 @@ static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
         return copy_page_from_iter_atomic(&folio->page, offset, bytes, i);
 }
 
-size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
-                size_t bytes, struct iov_iter *i);
-
 static __always_inline __must_check
 size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c720be70c8dd..f8885045f4d2 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -247,9 +247,6 @@ static inline void set_vm_flush_reset_perms(void *addr)
 }
 #endif
 
-/* for /proc/kcore */
-extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
-
 /*
  * Internals. Don't use..
  */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 8ff6824a1005..6d2b79973622 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -394,39 +394,6 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 }
 EXPORT_SYMBOL(copy_page_to_iter);
 
-size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
-                                 struct iov_iter *i)
-{
-        size_t res = 0;
-
-        if (!page_copy_sane(page, offset, bytes))
-                return 0;
-        if (WARN_ON_ONCE(i->data_source))
-                return 0;
-        page += offset / PAGE_SIZE; // first subpage
-        offset %= PAGE_SIZE;
-        while (1) {
-                void *kaddr = kmap_local_page(page);
-                size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
-
-                n = iterate_and_advance(i, n, kaddr + offset,
-                                        copy_to_user_iter_nofault,
-                                        memcpy_to_iter);
-                kunmap_local(kaddr);
-                res += n;
-                bytes -= n;
-                if (!bytes || !n)
-                        break;
-                offset += n;
-                if (offset == PAGE_SIZE) {
-                        page++;
-                        offset = 0;
-                }
-        }
-        return res;
-}
-EXPORT_SYMBOL(copy_page_to_iter_nofault);
-
 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
                            struct iov_iter *i)
 {
diff --git a/mm/nommu.c b/mm/nommu.c
index b6dc558d3144..1612b3a601fd 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -199,15 +199,6 @@ unsigned long vmalloc_to_pfn(const void *addr)
 }
 EXPORT_SYMBOL(vmalloc_to_pfn);
 
-long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
-{
-        /* Don't allow overflow */
-        if ((unsigned long) addr + count < count)
-                count = -(unsigned long) addr;
-
-        return copy_to_iter(addr, count, iter);
-}
-
 /*
  * vmalloc - allocate virtually contiguous memory
 *
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c17..79889a10e18d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -802,31 +802,6 @@ unsigned long vmalloc_nr_pages(void)
         return atomic_long_read(&nr_vmalloc_pages);
 }
 
-/* Look up the first VA which satisfies addr < va_end, NULL if none. */
-static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
-{
-        struct vmap_area *va = NULL;
-        struct rb_node *n = vmap_area_root.rb_node;
-
-        addr = (unsigned long)kasan_reset_tag((void *)addr);
-
-        while (n) {
-                struct vmap_area *tmp;
-
-                tmp = rb_entry(n, struct vmap_area, rb_node);
-                if (tmp->va_end > addr) {
-                        va = tmp;
-                        if (tmp->va_start <= addr)
-                                break;
-
-                        n = n->rb_left;
-                } else
-                        n = n->rb_right;
-        }
-
-        return va;
-}
-
 static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
 {
         struct rb_node *n = root->rb_node;
@@ -3562,276 +3537,6 @@ void *vmalloc_32_user(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc_32_user);
 
-/*
- * Atomically zero bytes in the iterator.
- *
- * Returns the number of zeroed bytes.
- */
-static size_t zero_iter(struct iov_iter *iter, size_t count)
-{
-        size_t remains = count;
-
-        while (remains > 0) {
-                size_t num, copied;
-
-                num = min_t(size_t, remains, PAGE_SIZE);
-                copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
-                remains -= copied;
-
-                if (copied < num)
-                        break;
-        }
-
-        return count - remains;
-}
-
-/*
- * small helper routine, copy contents to iter from addr.
- * If the page is not present, fill zero.
- *
- * Returns the number of copied bytes.
- */
-static size_t aligned_vread_iter(struct iov_iter *iter,
-                                 const char *addr, size_t count)
-{
-        size_t remains = count;
-        struct page *page;
-
-        while (remains > 0) {
-                unsigned long offset, length;
-                size_t copied = 0;
-
-                offset = offset_in_page(addr);
-                length = PAGE_SIZE - offset;
-                if (length > remains)
-                        length = remains;
-                page = vmalloc_to_page(addr);
-                /*
-                 * To do safe access to this _mapped_ area, we need lock. But
-                 * adding lock here means that we need to add overhead of
-                 * vmalloc()/vfree() calls for this _debug_ interface, rarely
-                 * used. Instead of that, we'll use an local mapping via
-                 * copy_page_to_iter_nofault() and accept a small overhead in
-                 * this access function.
-                 */
-                if (page)
-                        copied = copy_page_to_iter_nofault(page, offset,
-                                                           length, iter);
-                else
-                        copied = zero_iter(iter, length);
-
-                addr += copied;
-                remains -= copied;
-
-                if (copied != length)
-                        break;
-        }
-
-        return count - remains;
-}
-
-/*
- * Read from a vm_map_ram region of memory.
- *
- * Returns the number of copied bytes.
- */
-static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
-                                  size_t count, unsigned long flags)
-{
-        char *start;
-        struct vmap_block *vb;
-        struct xarray *xa;
-        unsigned long offset;
-        unsigned int rs, re;
-        size_t remains, n;
-
-        /*
-         * If it's area created by vm_map_ram() interface directly, but
-         * not further subdividing and delegating management to vmap_block,
-         * handle it here.
-         */
-        if (!(flags & VMAP_BLOCK))
-                return aligned_vread_iter(iter, addr, count);
-
-        remains = count;
-
-        /*
-         * Area is split into regions and tracked with vmap_block, read out
-         * each region and zero fill the hole between regions.
-         */
-        xa = addr_to_vb_xa((unsigned long) addr);
-        vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
-        if (!vb)
-                goto finished_zero;
-
-        spin_lock(&vb->lock);
-        if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
-                spin_unlock(&vb->lock);
-                goto finished_zero;
-        }
-
-        for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
-                size_t copied;
-
-                if (remains == 0)
-                        goto finished;
-
-                start = vmap_block_vaddr(vb->va->va_start, rs);
-
-                if (addr < start) {
-                        size_t to_zero = min_t(size_t, start - addr, remains);
-                        size_t zeroed = zero_iter(iter, to_zero);
-
-                        addr += zeroed;
-                        remains -= zeroed;
-
-                        if (remains == 0 || zeroed != to_zero)
-                                goto finished;
-                }
-
-                /*it could start reading from the middle of used region*/
-                offset = offset_in_page(addr);
-                n = ((re - rs + 1) << PAGE_SHIFT) - offset;
-                if (n > remains)
-                        n = remains;
-
-                copied = aligned_vread_iter(iter, start + offset, n);
-
-                addr += copied;
-                remains -= copied;
-
-                if (copied != n)
-                        goto finished;
-        }
-
-        spin_unlock(&vb->lock);
-
-finished_zero:
-        /* zero-fill the left dirty or free regions */
-        return count - remains + zero_iter(iter, remains);
-finished:
-        /* We couldn't copy/zero everything */
-        spin_unlock(&vb->lock);
-        return count - remains;
-}
-
-/**
- * vread_iter() - read vmalloc area in a safe way to an iterator.
- * @iter: the iterator to which data should be written.
- * @addr: vm address.
- * @count: number of bytes to be read.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from that area to a given buffer. If the given memory range
- * of [addr...addr+count) includes some valid address, data is copied to
- * proper area of @buf. If there are memory holes, they'll be zero-filled.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vread() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any information, as /proc/kcore.
- *
- * Return: number of bytes for which addr and buf should be increased
- * (same number as @count) or %0 if [addr...addr+count) doesn't
- * include any intersection with valid vmalloc area
- */
-long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
-{
-        struct vmap_area *va;
-        struct vm_struct *vm;
-        char *vaddr;
-        size_t n, size, flags, remains;
-
-        addr = kasan_reset_tag(addr);
-
-        /* Don't allow overflow */
-        if ((unsigned long) addr + count < count)
-                count = -(unsigned long) addr;
-
-        remains = count;
-
-        spin_lock(&vmap_area_lock);
-        va = find_vmap_area_exceed_addr((unsigned long)addr);
-        if (!va)
-                goto finished_zero;
-
-        /* no intersects with alive vmap_area */
-        if ((unsigned long)addr + remains <= va->va_start)
-                goto finished_zero;
-
-        list_for_each_entry_from(va, &vmap_area_list, list) {
-                size_t copied;
-
-                if (remains == 0)
-                        goto finished;
-
-                vm = va->vm;
-                flags = va->flags & VMAP_FLAGS_MASK;
-                /*
-                 * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need
-                 * be set together with VMAP_RAM.
-                 */
-                WARN_ON(flags == VMAP_BLOCK);
-
-                if (!vm && !flags)
-                        continue;
-
-                if (vm && (vm->flags & VM_UNINITIALIZED))
-                        continue;
-
-                /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
-                smp_rmb();
-
-                vaddr = (char *) va->va_start;
-                size = vm ? get_vm_area_size(vm) : va_size(va);
-
-                if (addr >= vaddr + size)
-                        continue;
-
-                if (addr < vaddr) {
-                        size_t to_zero = min_t(size_t, vaddr - addr, remains);
-                        size_t zeroed = zero_iter(iter, to_zero);
-
-                        addr += zeroed;
-                        remains -= zeroed;
-
-                        if (remains == 0 || zeroed != to_zero)
-                                goto finished;
-                }
-
-                n = vaddr + size - addr;
-                if (n > remains)
-                        n = remains;
-
-                if (flags & VMAP_RAM)
-                        copied = vmap_ram_vread_iter(iter, addr, n, flags);
-                else if (!(vm && (vm->flags & VM_IOREMAP)))
-                        copied = aligned_vread_iter(iter, addr, n);
-                else /* IOREMAP area is treated as memory hole */
-                        copied = zero_iter(iter, n);
-
-                addr += copied;
-                remains -= copied;
-
-                if (copied != n)
-                        goto finished;
-        }
-
-finished_zero:
-        spin_unlock(&vmap_area_lock);
-        /* zero-fill memory holes */
-        return count - remains + zero_iter(iter, remains);
-finished:
-        /* Nothing remains, or We couldn't copy/zero everything. */
-        spin_unlock(&vmap_area_lock);
-
-        return count - remains;
-}
-
 /**
  * remap_vmalloc_range_partial - map vmalloc pages to userspace
  * @vma: vma to cover