Re: [GIT PULL] vfs fixes

On Fri, 24 Nov 2023 at 02:28, Christian Brauner <brauner@xxxxxxxxxx> wrote:
>
> * Fix a bug introduced with the iov_iter rework from last cycle.
>
>   This broke /proc/kcore by copying too much and without the correct
>   offset.

Ugh. I think the whole /proc/kcore vmalloc handling is just COMPLETELY broken.

It does this:

        /*
         * vmalloc uses spinlocks, so we optimistically try to
         * read memory. If this fails, fault pages in and try
         * again until we are done.
         */
        while (true) {
                read += vread_iter(iter, src, left);
                if (read == tsz)
                        break;

                src += read;
                left -= read;

                if (fault_in_iov_iter_writeable(iter, left)) {
                        ret = -EFAULT;
                        goto out;
                }
        }


and that is just broken beyond words for two totally independent reasons:

 (a) vread_iter() looks like it can fail because of not having a
source, and return 0 (I dunno - it seems to try to avoid that, but it
all looks pretty dodgy)

       At that point fault_in_iov_iter_writeable() will try to fault
in the destination, which may work just fine, but if the source was
the problem, you'd have an endless loop.

 (b) That "read += X" is completely broken anyway. It should be just a
"=". So that whole loop is crazy broken, and only works for the case
where you get it all in one go. This code is crap.
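
For the record, if we *did* want to keep a retry loop, the bookkeeping
would have to look something like the below - just an untested sketch
using the same locals as the code above, and moot anyway since the
patch further down simply deletes the whole thing:

        size_t copied = 0;

        while (copied < tsz) {
                size_t n = vread_iter(iter, src + copied, tsz - copied);

                copied += n;
                if (copied == tsz)
                        break;

                /*
                 * If vread_iter() made no progress, the problem may well
                 * be the *source*, and faulting in the destination won't
                 * fix that - bail out instead of spinning forever.
                 */
                if (!n || fault_in_iov_iter_writeable(iter, tsz - copied)) {
                        ret = -EFAULT;
                        goto out;
                }
        }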

Now, I think it all works in practice for one simple reason: I doubt
anybody uses this (and it looks like the callees in the while loop try
very hard to always fill the whole area - maybe people noticed the
first bug and tried to work around it that way).

I guess there is at least one test program, but it presumably doesn't
trigger or care about the bugs here.

But I think we can get rid of all of this, and just deal with the
KCORE_VMALLOC case exactly the same way we already deal with VMEMMAP
and TEXT: by doing copy_from_kernel_nofault() into a bounce buffer,
and then a regular _copy_to_iter() or whatever.
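
For reference, that existing VMEMMAP/TEXT path is basically just the
following (quoting roughly from memory, with 'buf' being the per-open
bounce buffer kcore already allocates - check the real code for the
details):

        /*
         * Read through the bounce buffer: copy_from_kernel_nofault()
         * fails gracefully on unmapped addresses, in which case we just
         * zero-fill, and otherwise the buffer is known-good memory that
         * we can push out with a plain _copy_to_iter().
         */
        if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
                if (iov_iter_zero(tsz, iter) != tsz) {
                        ret = -EFAULT;
                        goto out;
                }
        } else if (_copy_to_iter(buf, tsz, iter) != tsz) {
                ret = -EFAULT;
                goto out;
        }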

NOTE! I looked at the code, and threw up in my mouth a little, and
maybe I missed something. Maybe it all works fine. But Omar - since
you found the original problem, may I implore you to test this
attached patch?

I just like how the patch looks:

 6 files changed, 1 insertion(+), 368 deletions(-)

and those 350+ deleted lines really looked disgusting to me.

This patch is on top of the pull I did, because obviously the fix in
that pull was correct, I just think we should go further and get rid
of this whole mess entirely.

                Linus
 fs/proc/kcore.c         |  26 +----
 include/linux/uio.h     |   3 -
 include/linux/vmalloc.h |   3 -
 lib/iov_iter.c          |  33 ------
 mm/nommu.c              |   9 --
 mm/vmalloc.c            | 295 ------------------------------------------------
 6 files changed, 1 insertion(+), 368 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6422e569b080..83a39f4d1ddc 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -504,31 +504,6 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
 		}
 
 		switch (m->type) {
-		case KCORE_VMALLOC:
-		{
-			const char *src = (char *)start;
-			size_t read = 0, left = tsz;
-
-			/*
-			 * vmalloc uses spinlocks, so we optimistically try to
-			 * read memory. If this fails, fault pages in and try
-			 * again until we are done.
-			 */
-			while (true) {
-				read += vread_iter(iter, src, left);
-				if (read == tsz)
-					break;
-
-				src += read;
-				left -= read;
-
-				if (fault_in_iov_iter_writeable(iter, left)) {
-					ret = -EFAULT;
-					goto out;
-				}
-			}
-			break;
-		}
 		case KCORE_USER:
 			/* User page is handled prior to normal kernel page: */
 			if (copy_to_iter((char *)start, tsz, iter) != tsz) {
@@ -555,6 +530,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
 				break;
 			}
 			fallthrough;
+		case KCORE_VMALLOC:
 		case KCORE_VMEMMAP:
 		case KCORE_TEXT:
 			/*
diff --git a/include/linux/uio.h b/include/linux/uio.h
index b6214cbf2a43..993a6bd8bdd3 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -187,9 +187,6 @@ static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
 	return copy_page_from_iter_atomic(&folio->page, offset, bytes, i);
 }
 
-size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
-				 size_t bytes, struct iov_iter *i);
-
 static __always_inline __must_check
 size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c720be70c8dd..f8885045f4d2 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -247,9 +247,6 @@ static inline void set_vm_flush_reset_perms(void *addr)
 }
 #endif
 
-/* for /proc/kcore */
-extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
-
 /*
  *	Internals.  Don't use..
  */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 8ff6824a1005..6d2b79973622 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -394,39 +394,6 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 }
 EXPORT_SYMBOL(copy_page_to_iter);
 
-size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
-				 struct iov_iter *i)
-{
-	size_t res = 0;
-
-	if (!page_copy_sane(page, offset, bytes))
-		return 0;
-	if (WARN_ON_ONCE(i->data_source))
-		return 0;
-	page += offset / PAGE_SIZE; // first subpage
-	offset %= PAGE_SIZE;
-	while (1) {
-		void *kaddr = kmap_local_page(page);
-		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
-
-		n = iterate_and_advance(i, n, kaddr + offset,
-					copy_to_user_iter_nofault,
-					memcpy_to_iter);
-		kunmap_local(kaddr);
-		res += n;
-		bytes -= n;
-		if (!bytes || !n)
-			break;
-		offset += n;
-		if (offset == PAGE_SIZE) {
-			page++;
-			offset = 0;
-		}
-	}
-	return res;
-}
-EXPORT_SYMBOL(copy_page_to_iter_nofault);
-
 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
diff --git a/mm/nommu.c b/mm/nommu.c
index b6dc558d3144..1612b3a601fd 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -199,15 +199,6 @@ unsigned long vmalloc_to_pfn(const void *addr)
 }
 EXPORT_SYMBOL(vmalloc_to_pfn);
 
-long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
-{
-	/* Don't allow overflow */
-	if ((unsigned long) addr + count < count)
-		count = -(unsigned long) addr;
-
-	return copy_to_iter(addr, count, iter);
-}
-
 /*
  *	vmalloc  -  allocate virtually contiguous memory
  *
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c17..79889a10e18d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -802,31 +802,6 @@ unsigned long vmalloc_nr_pages(void)
 	return atomic_long_read(&nr_vmalloc_pages);
 }
 
-/* Look up the first VA which satisfies addr < va_end, NULL if none. */
-static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
-{
-	struct vmap_area *va = NULL;
-	struct rb_node *n = vmap_area_root.rb_node;
-
-	addr = (unsigned long)kasan_reset_tag((void *)addr);
-
-	while (n) {
-		struct vmap_area *tmp;
-
-		tmp = rb_entry(n, struct vmap_area, rb_node);
-		if (tmp->va_end > addr) {
-			va = tmp;
-			if (tmp->va_start <= addr)
-				break;
-
-			n = n->rb_left;
-		} else
-			n = n->rb_right;
-	}
-
-	return va;
-}
-
 static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
 {
 	struct rb_node *n = root->rb_node;
@@ -3562,276 +3537,6 @@ void *vmalloc_32_user(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc_32_user);
 
-/*
- * Atomically zero bytes in the iterator.
- *
- * Returns the number of zeroed bytes.
- */
-static size_t zero_iter(struct iov_iter *iter, size_t count)
-{
-	size_t remains = count;
-
-	while (remains > 0) {
-		size_t num, copied;
-
-		num = min_t(size_t, remains, PAGE_SIZE);
-		copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
-		remains -= copied;
-
-		if (copied < num)
-			break;
-	}
-
-	return count - remains;
-}
-
-/*
- * small helper routine, copy contents to iter from addr.
- * If the page is not present, fill zero.
- *
- * Returns the number of copied bytes.
- */
-static size_t aligned_vread_iter(struct iov_iter *iter,
-				 const char *addr, size_t count)
-{
-	size_t remains = count;
-	struct page *page;
-
-	while (remains > 0) {
-		unsigned long offset, length;
-		size_t copied = 0;
-
-		offset = offset_in_page(addr);
-		length = PAGE_SIZE - offset;
-		if (length > remains)
-			length = remains;
-		page = vmalloc_to_page(addr);
-		/*
-		 * To do safe access to this _mapped_ area, we need lock. But
-		 * adding lock here means that we need to add overhead of
-		 * vmalloc()/vfree() calls for this _debug_ interface, rarely
-		 * used. Instead of that, we'll use an local mapping via
-		 * copy_page_to_iter_nofault() and accept a small overhead in
-		 * this access function.
-		 */
-		if (page)
-			copied = copy_page_to_iter_nofault(page, offset,
-							   length, iter);
-		else
-			copied = zero_iter(iter, length);
-
-		addr += copied;
-		remains -= copied;
-
-		if (copied != length)
-			break;
-	}
-
-	return count - remains;
-}
-
-/*
- * Read from a vm_map_ram region of memory.
- *
- * Returns the number of copied bytes.
- */
-static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
-				  size_t count, unsigned long flags)
-{
-	char *start;
-	struct vmap_block *vb;
-	struct xarray *xa;
-	unsigned long offset;
-	unsigned int rs, re;
-	size_t remains, n;
-
-	/*
-	 * If it's area created by vm_map_ram() interface directly, but
-	 * not further subdividing and delegating management to vmap_block,
-	 * handle it here.
-	 */
-	if (!(flags & VMAP_BLOCK))
-		return aligned_vread_iter(iter, addr, count);
-
-	remains = count;
-
-	/*
-	 * Area is split into regions and tracked with vmap_block, read out
-	 * each region and zero fill the hole between regions.
-	 */
-	xa = addr_to_vb_xa((unsigned long) addr);
-	vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
-	if (!vb)
-		goto finished_zero;
-
-	spin_lock(&vb->lock);
-	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
-		spin_unlock(&vb->lock);
-		goto finished_zero;
-	}
-
-	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
-		size_t copied;
-
-		if (remains == 0)
-			goto finished;
-
-		start = vmap_block_vaddr(vb->va->va_start, rs);
-
-		if (addr < start) {
-			size_t to_zero = min_t(size_t, start - addr, remains);
-			size_t zeroed = zero_iter(iter, to_zero);
-
-			addr += zeroed;
-			remains -= zeroed;
-
-			if (remains == 0 || zeroed != to_zero)
-				goto finished;
-		}
-
-		/*it could start reading from the middle of used region*/
-		offset = offset_in_page(addr);
-		n = ((re - rs + 1) << PAGE_SHIFT) - offset;
-		if (n > remains)
-			n = remains;
-
-		copied = aligned_vread_iter(iter, start + offset, n);
-
-		addr += copied;
-		remains -= copied;
-
-		if (copied != n)
-			goto finished;
-	}
-
-	spin_unlock(&vb->lock);
-
-finished_zero:
-	/* zero-fill the left dirty or free regions */
-	return count - remains + zero_iter(iter, remains);
-finished:
-	/* We couldn't copy/zero everything */
-	spin_unlock(&vb->lock);
-	return count - remains;
-}
-
-/**
- * vread_iter() - read vmalloc area in a safe way to an iterator.
- * @iter:         the iterator to which data should be written.
- * @addr:         vm address.
- * @count:        number of bytes to be read.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from that area to a given buffer. If the given memory range
- * of [addr...addr+count) includes some valid address, data is copied to
- * proper area of @buf. If there are memory holes, they'll be zero-filled.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vread() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any information, as /proc/kcore.
- *
- * Return: number of bytes for which addr and buf should be increased
- * (same number as @count) or %0 if [addr...addr+count) doesn't
- * include any intersection with valid vmalloc area
- */
-long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
-{
-	struct vmap_area *va;
-	struct vm_struct *vm;
-	char *vaddr;
-	size_t n, size, flags, remains;
-
-	addr = kasan_reset_tag(addr);
-
-	/* Don't allow overflow */
-	if ((unsigned long) addr + count < count)
-		count = -(unsigned long) addr;
-
-	remains = count;
-
-	spin_lock(&vmap_area_lock);
-	va = find_vmap_area_exceed_addr((unsigned long)addr);
-	if (!va)
-		goto finished_zero;
-
-	/* no intersects with alive vmap_area */
-	if ((unsigned long)addr + remains <= va->va_start)
-		goto finished_zero;
-
-	list_for_each_entry_from(va, &vmap_area_list, list) {
-		size_t copied;
-
-		if (remains == 0)
-			goto finished;
-
-		vm = va->vm;
-		flags = va->flags & VMAP_FLAGS_MASK;
-		/*
-		 * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need
-		 * be set together with VMAP_RAM.
-		 */
-		WARN_ON(flags == VMAP_BLOCK);
-
-		if (!vm && !flags)
-			continue;
-
-		if (vm && (vm->flags & VM_UNINITIALIZED))
-			continue;
-
-		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
-		smp_rmb();
-
-		vaddr = (char *) va->va_start;
-		size = vm ? get_vm_area_size(vm) : va_size(va);
-
-		if (addr >= vaddr + size)
-			continue;
-
-		if (addr < vaddr) {
-			size_t to_zero = min_t(size_t, vaddr - addr, remains);
-			size_t zeroed = zero_iter(iter, to_zero);
-
-			addr += zeroed;
-			remains -= zeroed;
-
-			if (remains == 0 || zeroed != to_zero)
-				goto finished;
-		}
-
-		n = vaddr + size - addr;
-		if (n > remains)
-			n = remains;
-
-		if (flags & VMAP_RAM)
-			copied = vmap_ram_vread_iter(iter, addr, n, flags);
-		else if (!(vm && (vm->flags & VM_IOREMAP)))
-			copied = aligned_vread_iter(iter, addr, n);
-		else /* IOREMAP area is treated as memory hole */
-			copied = zero_iter(iter, n);
-
-		addr += copied;
-		remains -= copied;
-
-		if (copied != n)
-			goto finished;
-	}
-
-finished_zero:
-	spin_unlock(&vmap_area_lock);
-	/* zero-fill memory holes */
-	return count - remains + zero_iter(iter, remains);
-finished:
-	/* Nothing remains, or We couldn't copy/zero everything. */
-	spin_unlock(&vmap_area_lock);
-
-	return count - remains;
-}
-
 /**
  * remap_vmalloc_range_partial - map vmalloc pages to userspace
  * @vma:		vma to cover
