Re: sendfile(2) erroneously yields EINVAL on too large counts

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Jan!

On Thu, Feb 15, 2024 at 02:49:05PM +0100, Jan Engelhardt wrote:
> 
> Observed:
> The following program below leads to sendfile returning -1 and setting 
> errno=EINVAL.
> 
> Expected:
> Return 0.

I disagree.  The program has some problems, and should report an error.

> System: Linux 6.7.4 amd64 glibc-2.39
> 
> Rationale:
> 
> As per man-pages 6.60's sendfile.2 page:
> 
>        EINVAL Descriptor is not valid or locked, or an mmap(2)-like 
>               operation is not available for in_fd, or count is 
>               negative.
> 
> (Invalid descriptors should yield EBADF instead, I think.)
> mmap is probably functional, since the testcase works if write() calls 
> are removed.
> count is not negative.

count + file offset *is* negative.  You forgot to lseek(2).

> It appears that there may be a
> `src offset + count > SSIZE_MAX || dst offset + count > SSIZE_MAX`
> check in the kernel somewhere,

There are several.  See at the bottom.

> which sounds an awful lot like the documented EOVERFLOW behavior:
> 
>        EOVERFLOW
>               count is too large, the operation would result in
>               exceeding the maximum size of either the input file or
>               the output file.
> 
> but the reported error is EINVAL rather than EOVERFLOW.  Moreover, the
> (actual) result from this testcase does not go above a few bytes
> anyhow, so should not signal an overflow anyway.

The kernel detects that offset+count would overflow, and aborts early.
That's actually a good thing.  Otherwise, we wouldn't have noticed that
the program is missing an lseek(2) call until much later.  Also, given
addition of count+offset would cause overflow, that is, undefined
behavior, it's better to not even start.  Otherwise, it gets tricky to
write code that doesn't invoke UB.

(By inspecting the kernel code I'm not sure if it avoids UB; I think it
might be triggering UB; let me debug that and come with an update.)

> #define _GNU_SOURCE 1
> #include <errno.h>
> #include <fcntl.h>
> #include <limits.h>
> #include <stdio.h>
> #include <string.h>
> #include <unistd.h>
> #include <sys/sendfile.h>
> int main(int argc, char **argv)
> {
>         int src = open(".", O_RDWR | O_TMPFILE, 0666);
>         write(src, "1234", 4);
>         int dst = open(".", O_RDWR | O_TMPFILE, 0666);
>         write(src, "1234", 4);
>         ssize_t ret = sendfile(dst, src, NULL, SSIZE_MAX);

Even if you pass SSIZE_MAX - 8, which will be accepted by the kernel,
this call will always copy exactly 0 bytes.  Rationale: the file
descriptor is positioned at the end of the source file.

>         printf("%ld\n", (long)ret);
>         if (ret < 0)
>                 printf("%s\n", strerror(errno));
>         return 0;
> }
> 
> As it stands, a sendfile() user just wanting to shovel src to dst
> cannot just "fire-and-forget" but has to compute a suitable count 
> beforehand.

You can.  But you need to put the file descriptor at the begining of the
file (or if you really want to start reading mid-file, you'll need to
pass SSIZE_MAX-offset).

> Is this really what we want?

I'm not entirely sure if the kernel should report EINVAL or EOVERFLOW,
nor what should the manual page specify.

But regarding the fire-and-offset question, it's possible to do it:


alx@debian:~/tmp$ cat sf.c 
#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/sendfile.h>


int
main(void)
{
	int      src, dst;
	ssize_t  ret;
	char     buf[BUFSIZ];

	src = open(".", O_RDWR | O_TMPFILE, 0666);
	if (src == -1)
		err(EXIT_FAILURE, "open");
	dst = open(".", O_RDWR | O_TMPFILE, 0666);
	if (dst == -1)
		err(EXIT_FAILURE, "open");

	ret = write(src, "1234", 4);
	if (ret != 4)
		err(EXIT_FAILURE, "write: %zd", ret);
	write(src, "asd\n", 4);
	if (ret != 4)
		err(EXIT_FAILURE, "write: %zd", ret);

	if (lseek(src, 0, SEEK_SET) == -1)
		err(EXIT_FAILURE, "lseek");
	ret = sendfile(dst, src, NULL, SSIZE_MAX);
	if (ret != 8)
		err(EXIT_FAILURE, "sendfile: %zd", ret);

	if (lseek(dst, 0, SEEK_SET) == -1)
		err(EXIT_FAILURE, "lseek");
	ret = read(dst, buf, BUFSIZ);
	if (ret != 8)
		err(EXIT_FAILURE, "read: %zd", ret);

	ret = write(STDOUT_FILENO, buf, ret);
	if (ret != 8)
		err(EXIT_FAILURE, "write: %zd", ret);
	return 0;
}
alx@debian:~/tmp$ cc -Wall -Wextra sf.c 
alx@debian:~/tmp$ ./a.out 
1234asd
alx@debian:~/tmp$

Or the same thing without error handling, to make it more readable:

alx@debian:~/tmp$ cat sf2.c 
#define _GNU_SOURCE
#include <fcntl.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/sendfile.h>


int
main(void)
{
	int      src, dst;
	ssize_t  ret;
	char     buf[BUFSIZ];

	src = open(".", O_RDWR | O_TMPFILE, 0666);
	dst = open(".", O_RDWR | O_TMPFILE, 0666);

	ret = write(src, "1234", 4);
	write(src, "asd\n", 4);

	lseek(src, 0, SEEK_SET);
	sendfile(dst, src, NULL, SSIZE_MAX);

	lseek(dst, 0, SEEK_SET);
	ret = read(dst, buf, BUFSIZ);
	write(STDOUT_FILENO, buf, ret);
	return 0;
}
alx@debian:~/tmp$ cc -Wall -Wextra sf2.c 
alx@debian:~/tmp$ ./a.out 
1234asd
alx@debian:~/tmp$


TL;DR:

I'm not sure if this should EINVAL or EOVERFLOW, but other than that, I
think we're good.  Feel free to suggest that the page or the kernel is
wrong regarding errno.

Have a lovely day!
Alex


----

See where the kernel reports EINVAL or EOVERFLOW:


alx@debian:~/src/linux/linux/master$ find . -type f \
				| grep '\.c$' \
				| xargs grepc -tfld -m1 sendfile;
./fs/read_write.c:SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
	loff_t pos;
	off_t off;
	ssize_t ret;

	if (offset) {
		if (unlikely(get_user(off, offset)))
			return -EFAULT;
		pos = off;
		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
		if (unlikely(put_user(pos, offset)))
			return -EFAULT;
		return ret;
	}

	return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
alx@debian:~/src/linux/linux/master$ find fs/ -type f \
				| grep '\.c$' \
				| xargs grepc -tfd do_sendfile;
fs/read_write.c:static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
			   size_t count, loff_t max)
{
	struct fd in, out;
	struct inode *in_inode, *out_inode;
	struct pipe_inode_info *opipe;
	loff_t pos;
	loff_t out_pos;
	ssize_t retval;
	int fl;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in = fdget(in_fd);
	if (!in.file)
		goto out;
	if (!(in.file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -ESPIPE;
	if (!ppos) {
		pos = in.file->f_pos;
	} else {
		pos = *ppos;
		if (!(in.file->f_mode & FMODE_PREAD))
			goto fput_in;
	}
	retval = rw_verify_area(READ, in.file, &pos, count);
	if (retval < 0)
		goto fput_in;
	if (count > MAX_RW_COUNT)
		count =  MAX_RW_COUNT;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out = fdget(out_fd);
	if (!out.file)
		goto fput_in;
	if (!(out.file->f_mode & FMODE_WRITE))
		goto fput_out;
	in_inode = file_inode(in.file);
	out_inode = file_inode(out.file);
	out_pos = out.file->f_pos;

	if (!max)
		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

	if (unlikely(pos + count > max)) {
		retval = -EOVERFLOW;
		if (pos >= max)
			goto fput_out;
		count = max - pos;
	}

	fl = 0;
#if 0
	/*
	 * We need to debate whether we can enable this or not. The
	 * man page documents EAGAIN return for the output at least,
	 * and the application is arguably buggy if it doesn't expect
	 * EAGAIN on a non-blocking file descriptor.
	 */
	if (in.file->f_flags & O_NONBLOCK)
		fl = SPLICE_F_NONBLOCK;
#endif
	opipe = get_pipe_info(out.file, true);
	if (!opipe) {
		retval = rw_verify_area(WRITE, out.file, &out_pos, count);
		if (retval < 0)
			goto fput_out;
		retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
					  count, fl);
	} else {
		if (out.file->f_flags & O_NONBLOCK)
			fl |= SPLICE_F_NONBLOCK;

		retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
	}

	if (retval > 0) {
		add_rchar(current, retval);
		add_wchar(current, retval);
		fsnotify_access(in.file);
		fsnotify_modify(out.file);
		out.file->f_pos = out_pos;
		if (ppos)
			*ppos = pos;
		else
			in.file->f_pos = pos;
	}

	inc_syscr(current);
	inc_syscw(current);
	if (pos > max)
		retval = -EOVERFLOW;

fput_out:
	fdput(out);
fput_in:
	fdput(in);
out:
	return retval;
}
alx@debian:~/src/linux/linux/master$ find fs/ -type f \
				| grep '\.c$' \
				| xargs grepc -tfd rw_verify_area;
fs/read_write.c:int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	int mask = read_write == READ ? MAY_READ : MAY_WRITE;
	int ret;

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	if (ppos) {
		loff_t pos = *ppos;

		if (unlikely(pos < 0)) {
			if (!unsigned_offsets(file))
				return -EINVAL;
			if (count >= -pos) /* both values are in 0..LLONG_MAX */
				return -EOVERFLOW;
		} else if (unlikely((loff_t) (pos + count) < 0)) {
			if (!unsigned_offsets(file))
				return -EINVAL;
		}
	}

	ret = security_file_permission(file, mask);
	if (ret)
		return ret;

	return fsnotify_file_area_perm(file, mask, ppos, count);
}


-- 
<https://www.alejandro-colomar.es/>
Looking for a remote C programming job at the moment.

Attachment: signature.asc
Description: PGP signature


[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux