Re: Update of file offset on write() etc. is non-atomic with I/O

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Ok, sorry for the long delay, I was distracted (and hoping that Al
would come up with a patch).

Anyway, attached is the patch I think we should do for this issue. It
is fairly simple:

 - it adds a "f_pos_mutex" to the "struct file".

 - it adds a new FMODE_ATOMIC_POS flag to the file mode flags to mark
things that need atomic f_pos updates

 - it makes the "struct fd" flags be two flags rather than one: the
second flag is for "unlock f_pos_mutex when done"

 - it introduces "fd[get,put]_pos()" which gets the f_pos_mutex when required

 - it makes read/write/lseek use that.

It's pretty damn straightforward, I think, and is minimally serializing.

Al, comments? Yongzhi Pan, this is pretty much untested, but it's
pretty simple and it does fix your test-case.

                       Linus

On Thu, Feb 20, 2014 at 9:14 AM, Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
> Yes, I do think we violate POSIX here because of how we handle f_pos
> (the earlier thread from 2006 you point to talks about the "thread
> safe" part, the point here about the actual wording of "atomic" is a
> separate issue).
>
> Long long ago we used to just pass in the pointer to f_pos directly,
> and then the low-level write would update it all under the inode
> semaphore, and all was good.
>
> And then we ended up having tons of problems with non-regular files
> and drivers accessing f_pos and having nasty races with it because
> they did *not* have any locking (and very fundamentally didn't want
> any), and we broke the serialization of f_pos. We still do the *IO*
> atomically, but yes, the f_pos access itself is outside the lock.
>
> Ho humm.. Al, any ideas of how to fix this?
 fs/file_table.c      |  1 +
 fs/open.c            |  4 ++++
 fs/read_write.c      | 46 ++++++++++++++++++++++++++++++++++++----------
 include/linux/file.h |  6 +++---
 include/linux/fs.h   |  6 +++++-
 5 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index 5fff9030be34..5b24008ea4f6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -135,6 +135,7 @@ struct file *get_empty_filp(void)
 	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	spin_lock_init(&f->f_lock);
+	mutex_init(&f->f_pos_lock);
 	eventpoll_init_file(f);
 	/* f->f_version: 0 */
 	return f;
diff --git a/fs/open.c b/fs/open.c
index 4b3e1edf2fe4..b9ed8b25c108 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f,
 		return 0;
 	}
 
+	/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
+	if (S_ISREG(inode->i_mode))
+		f->f_mode |= FMODE_ATOMIC_POS;
+
 	f->f_op = fops_get(inode->i_fop);
 	if (unlikely(WARN_ON(!f->f_op))) {
 		error = -ENODEV;
diff --git a/fs/read_write.c b/fs/read_write.c
index edc5746a902a..9273ec92f07e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -264,10 +264,36 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 }
 EXPORT_SYMBOL(vfs_llseek);
 
+/*
+ * We only lock f_pos if we have threads (in which case "need_put"
+ * will be set) or if the file might be shared with another process
+ * (in which case the file count is elevated).
+ */
+static struct fd fdget_pos(int fd)
+{
+	struct fd f = fdget(fd);
+	struct file *file = f.file;
+
+	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
+		if (f.need_put || file_count(file) > 1) {
+			f.need_pos_unlock = 1;
+			mutex_lock(&file->f_pos_lock);
+		}
+	}
+	return f;
+}
+
+static void fdput_pos(struct fd f)
+{
+	if (f.need_pos_unlock)
+		mutex_unlock(&f.file->f_pos_lock);
+	fdput(f);
+}
+
 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 {
 	off_t retval;
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	if (!f.file)
 		return -EBADF;
 
@@ -278,7 +304,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 		if (res != (loff_t)retval)
 			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
 	}
-	fdput(f);
+	fdput_pos(f);
 	return retval;
 }
 
@@ -498,7 +524,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
 
 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
@@ -506,7 +532,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 		ret = vfs_read(f.file, buf, count, &pos);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
-		fdput(f);
+		fdput_pos(f);
 	}
 	return ret;
 }
@@ -514,7 +540,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 		size_t, count)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
@@ -522,7 +548,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 		ret = vfs_write(f.file, buf, count, &pos);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
-		fdput(f);
+		fdput_pos(f);
 	}
 
 	return ret;
@@ -797,7 +823,7 @@ EXPORT_SYMBOL(vfs_writev);
 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 		unsigned long, vlen)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
@@ -805,7 +831,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 		ret = vfs_readv(f.file, vec, vlen, &pos);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
-		fdput(f);
+		fdput_pos(f);
 	}
 
 	if (ret > 0)
@@ -817,7 +843,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 		unsigned long, vlen)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
@@ -825,7 +851,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 		ret = vfs_writev(f.file, vec, vlen, &pos);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
-		fdput(f);
+		fdput_pos(f);
 	}
 
 	if (ret > 0)
diff --git a/include/linux/file.h b/include/linux/file.h
index cbacf4faf447..636634a09c92 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -28,7 +28,7 @@ static inline void fput_light(struct file *file, int fput_needed)
 
 struct fd {
 	struct file *file;
-	int need_put;
+	unsigned need_put:1, need_pos_unlock:1;
 };
 
 static inline void fdput(struct fd fd)
@@ -44,7 +44,7 @@ static inline struct fd fdget(unsigned int fd)
 {
 	int b;
 	struct file *f = fget_light(fd, &b);
-	return (struct fd){f,b};
+	return (struct fd){f,b,0};
 }
 
 extern struct file *fget_raw(unsigned int fd);
@@ -54,7 +54,7 @@ static inline struct fd fdget_raw(unsigned int fd)
 {
 	int b;
 	struct file *f = fget_raw_light(fd, &b);
-	return (struct fd){f,b};
+	return (struct fd){f,b,0};
 }
 
 extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 60829565e552..ebfde04bca06 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -123,6 +123,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File is opened with O_PATH; almost nothing can be done with it */
 #define FMODE_PATH		((__force fmode_t)0x4000)
 
+/* File needs atomic accesses to f_pos */
+#define FMODE_ATOMIC_POS	((__force fmode_t)0x8000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x1000000)
 
@@ -780,13 +783,14 @@ struct file {
 	const struct file_operations	*f_op;
 
 	/*
-	 * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR.
+	 * Protects f_ep_links, f_flags.
 	 * Must not be taken from IRQ context.
 	 */
 	spinlock_t		f_lock;
 	atomic_long_t		f_count;
 	unsigned int 		f_flags;
 	fmode_t			f_mode;
+	struct mutex		f_pos_lock;
 	loff_t			f_pos;
 	struct fown_struct	f_owner;
 	const struct cred	*f_cred;

[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux